/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
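/*
 * Illustrative userspace sketch (not part of this file): with the
 * net.ipv4.tcp_tw_reuse sysctl enabled, a client that binds the same
 * local port can reconnect to the same peer while the previous
 * connection is still in TIME-WAIT; that is the case tcp_twsk_unique()
 * arbitrates above. Address and port values are made-up examples;
 * the block compiles as a standalone program.
 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

static void connect_once(void)
{
	struct sockaddr_in local = { .sin_family = AF_INET,
				     .sin_port = htons(40000) };
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(80) };
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
	/* SO_REUSEADDR only lets bind() succeed; whether the TIME-WAIT
	 * 4-tuple may really be reused is decided by tcp_twsk_unique(). */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&local, sizeof(local));
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");
	close(fd);	/* active close leaves the tuple in TIME-WAIT */
}

int main(void)
{
	connect_once();	/* first connection, then TIME-WAIT */
	connect_once();	/* succeeds only if the TW tuple may be reused */
	return 0;
}
#endif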
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
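/*
 * Illustrative userspace sketch (not part of this file): the minimal
 * call sequence that reaches tcp_v4_connect() through the socket layer.
 * Compiles as a standalone program; the peer address is an example
 * value.
 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(80) };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
	/* connect() on an unbound socket makes the kernel pick the
	 * source address and port, as described in the comment above. */
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
#endif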
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
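/*
 * Illustrative userspace sketch (not part of this file): the
 * inet->pmtudisc value tested above is controlled per socket with the
 * IP_MTU_DISCOVER socket option. A minimal example, compiled
 * standalone:
 */
#if 0
#include <netinet/in.h>	/* IP_MTU_DISCOVER, IP_PMTUDISC_* */
#include <sys/socket.h>

static void enable_pmtu_discovery(int fd)
{
	int val = IP_PMTUDISC_DO;	/* set DF, never fragment locally */

	/* With IP_PMTUDISC_DONT the branch above is skipped and the
	 * kernel leaves the MSS alone after an ICMP_FRAG_NEEDED. */
	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}
#endif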
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_write_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
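/*
 * Illustrative sketch (not part of this file): the ones'-complement
 * pseudo-header checksum that tcp_v4_check() builds on, written out as
 * plain C for a contiguous buffer. The helper names are hypothetical;
 * addresses are taken in host byte order for simplicity.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

/* Sum 16-bit words in ones'-complement arithmetic. */
static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;	/* pad odd byte with zero */
	return sum;
}

/* Checksum of TCP header+payload plus the IPv4 pseudo header
 * (saddr, daddr, zero, protocol, TCP length). */
static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
			     const uint8_t *tcp, size_t len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;		/* IPPROTO_TCP */
	sum += (uint32_t)len;	/* TCP length */
	sum = csum_add(sum, tcp, len);
	while (sum >> 16)	/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif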
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sock_net(sk), skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sock_net(sk), skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
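/*
 * Illustrative userspace sketch (not part of this file): installing an
 * RFC 2385 key with the TCP_MD5SIG socket option, which lands in
 * tcp_v4_parse_md5_keys() above. Compiles standalone; the peer address
 * and key are example values.
 */
#if 0
#include <arpa/inet.h>
#include <linux/tcp.h>	/* struct tcp_md5sig, TCP_MD5SIG */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const char *peer_ip)
{
	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	sin->sin_family = AF_INET;
	inet_pton(AF_INET, peer_ip, &sin->sin_addr);
	memcpy(md5.tcpm_key, "secret", 6);
	/* A zero tcpm_keylen would delete the key instead. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif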
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_close(newsk, 0);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see why it failed. 8)8)	--ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	if (!skb->data_len)
		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
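/*
 * Illustrative userspace sketch (not part of this file): attaching a
 * classic BPF socket filter with SO_ATTACH_FILTER. sk_filter_trim_cap()
 * above runs such a filter and may trim the segment to the length the
 * filter returns. Compiles standalone.
 */
#if 0
#include <linux/filter.h>	/* struct sock_filter, struct sock_fprog */
#include <sys/socket.h>

static int attach_accept_all(int fd)
{
	/* BPF_RET | BPF_K with k = 0xffff: accept up to 64KB. */
	struct sock_filter code[] = {
		{ 0x06, 0, 0, 0x0000ffff },
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}
#endif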
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
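/*
 * Illustrative userspace sketch (not part of this file): decoding one
 * /proc/net/tcp line produced by get_tcp4_sock() above. Addresses and
 * ports are printed in hex; on a little-endian host the parsed address
 * value round-trips to network byte order. Compiles standalone.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;
		struct in_addr a;

		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		a.s_addr = laddr;	/* see byte-order note above */
		printf("%s:%u state %#x\n", inet_ntoa(a), lport, state);
	}
	fclose(f);
	return 0;
}
#endif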
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
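/*
 * Illustrative userspace sketch (not part of this file): the defaults
 * assigned in tcp_sk_init() surface per network namespace under
 * /proc/sys/net/ipv4/. Compiles standalone.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_fin_timeout", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("tcp_fin_timeout (seconds): %s", buf);
	if (f)
		fclose(f);
	return 0;
}
#endif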
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}