Merge 4.4.130 into android-4.4
sagit-ice-cold/kernel_xiaomi_msm8998.git: net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103                                           ip_hdr(skb)->saddr,
104                                           tcp_hdr(skb)->dest,
105                                           tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112
113         /* With PAWS, it is safe from the viewpoint
114            of data integrity. Even without PAWS it is safe provided sequence
115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116
117            Actually, the idea is close to VJ's one, only timestamp cache is
118            held not per host, but per port pair and TW bucket is used as state
119            holder.
120
121            If the TW bucket has already been destroyed we fall back to VJ's scheme
122            and use initial timestamp retrieved from peer table.
123          */
124         if (tcptw->tw_ts_recent_stamp &&
125             (!twp || (sysctl_tcp_tw_reuse &&
126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128                 if (tp->write_seq == 0)
129                         tp->write_seq = 1;
130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132                 sock_hold(sktw);
133                 return 1;
134         }
135
136         return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
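/* Illustrative sketch, not part of this file: sysctl_tcp_tw_reuse above is
 * the net.ipv4.tcp_tw_reuse knob.  A minimal way to flip it from user space,
 * assuming root and the usual procfs mount, is shown below; with it enabled,
 * tcp_twsk_unique() may let a new connect() reuse a TIME-WAIT port pair once
 * the timestamp check passes.
 */
#if 0
#include <stdio.h>

static int example_enable_tw_reuse(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

	if (!f)
		return -1;
	fputs("1\n", f);	/* 0 = off, 1 = on */
	return fclose(f);
}
#endif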
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144         struct inet_sock *inet = inet_sk(sk);
145         struct tcp_sock *tp = tcp_sk(sk);
146         __be16 orig_sport, orig_dport;
147         __be32 daddr, nexthop;
148         struct flowi4 *fl4;
149         struct rtable *rt;
150         int err;
151         struct ip_options_rcu *inet_opt;
152
153         if (addr_len < sizeof(struct sockaddr_in))
154                 return -EINVAL;
155
156         if (usin->sin_family != AF_INET)
157                 return -EAFNOSUPPORT;
158
159         nexthop = daddr = usin->sin_addr.s_addr;
160         inet_opt = rcu_dereference_protected(inet->inet_opt,
161                                              sock_owned_by_user(sk));
162         if (inet_opt && inet_opt->opt.srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet_opt->opt.faddr;
166         }
167
168         orig_sport = inet->inet_sport;
169         orig_dport = usin->sin_port;
170         fl4 = &inet->cork.fl.u.ip4;
171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                               IPPROTO_TCP,
174                               orig_sport, orig_dport, sk);
175         if (IS_ERR(rt)) {
176                 err = PTR_ERR(rt);
177                 if (err == -ENETUNREACH)
178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return err;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet_opt || !inet_opt->opt.srr)
188                 daddr = fl4->daddr;
189
190         if (!inet->inet_saddr)
191                 inet->inet_saddr = fl4->saddr;
192         sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 if (likely(!tp->repair))
199                         tp->write_seq      = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215         /* Socket identity is still unknown (sport may be zero).
216          * However we set the state to SYN-SENT and, without releasing the socket
217          * lock, select a source port, enter ourselves into the hash tables and
218          * complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(&tcp_death_row, sk);
222         if (err)
223                 goto failure;
224
225         sk_set_txhash(sk);
226
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237
238         if (!tp->write_seq && likely(!tp->repair))
239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240                                                            inet->inet_daddr,
241                                                            inet->inet_sport,
242                                                            usin->sin_port);
243
244         inet->inet_id = tp->write_seq ^ jiffies;
245
246         err = tcp_connect(sk);
247
248         rt = NULL;
249         if (err)
250                 goto failure;
251
252         return 0;
253
254 failure:
255         /*
256          * This unhashes the socket and releases the local port,
257          * if necessary.
258          */
259         tcp_set_state(sk, TCP_CLOSE);
260         ip_rt_put(rt);
261         sk->sk_route_caps = 0;
262         inet->inet_dport = 0;
263         return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
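/* Illustrative sketch, not part of this file: tcp_v4_connect() is reached from
 * user space through an ordinary connect() call on an AF_INET stream socket.
 * The address and port below are placeholders.
 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int example_tcp_connect(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),		/* placeholder port */
	};

	if (fd < 0)
		return -1;
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder address */
	/* The kernel side of this call runs tcp_v4_connect() above. */
	return connect(fd, (struct sockaddr *)&dst, sizeof(dst));
}
#endif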
266
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274         struct inet_sock *inet = inet_sk(sk);
275         struct dst_entry *dst;
276         u32 mtu;
277
278         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
279                 return;
280         mtu = tcp_sk(sk)->mtu_info;
281         dst = inet_csk_update_pmtu(sk, mtu);
282         if (!dst)
283                 return;
284
285         /* Something is about to go wrong... Remember the soft error
286          * in case this connection is not able to recover.
287          */
288         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
289                 sk->sk_err_soft = EMSGSIZE;
290
291         mtu = dst_mtu(dst);
292
293         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
294             ip_sk_accept_pmtu(sk) &&
295             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
296                 tcp_sync_mss(sk, mtu);
297
298                 /* Resend the TCP packet because it's
299                  * clear that the old packet has been
300                  * dropped. This is the new "fast" path mtu
301                  * discovery.
302                  */
303                 tcp_simple_retransmit(sk);
304         } /* else let the usual retransmit timer handle it */
305 }
306 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
307
308 static void do_redirect(struct sk_buff *skb, struct sock *sk)
309 {
310         struct dst_entry *dst = __sk_dst_check(sk, 0);
311
312         if (dst)
313                 dst->ops->redirect(dst, sk, skb);
314 }
315
316
317 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
318 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
319 {
320         struct request_sock *req = inet_reqsk(sk);
321         struct net *net = sock_net(sk);
322
323         /* ICMPs are not backlogged, hence we cannot get
324          * an established socket here.
325          */
326         if (seq != tcp_rsk(req)->snt_isn) {
327                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
328         } else if (abort) {
329                 /*
330                  * Still in SYN_RECV, just remove it silently.
331                  * There is no good way to pass the error to the newly
332                  * created socket, and POSIX does not want network
333                  * errors returned from accept().
334                  */
335                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
336                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
337         }
338         reqsk_put(req);
339 }
340 EXPORT_SYMBOL(tcp_req_err);
341
342 /*
343  * This routine is called by the ICMP module when it gets some
344  * sort of error condition.  If err < 0 then the socket should
345  * be closed and the error returned to the user.  If err > 0
346  * it's just the icmp type << 8 | icmp code.  After adjustment
347  * header points to the first 8 bytes of the tcp header.  We need
348  * to find the appropriate port.
349  *
350  * The locking strategy used here is very "optimistic". When
351  * someone else accesses the socket the ICMP is just dropped
352  * and for some paths there is no check at all.
353  * A more general error queue to queue errors for later handling
354  * is probably better.
355  *
356  */
357
358 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
359 {
360         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
361         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
362         struct inet_connection_sock *icsk;
363         struct tcp_sock *tp;
364         struct inet_sock *inet;
365         const int type = icmp_hdr(icmp_skb)->type;
366         const int code = icmp_hdr(icmp_skb)->code;
367         struct sock *sk;
368         struct sk_buff *skb;
369         struct request_sock *fastopen;
370         __u32 seq, snd_una;
371         __u32 remaining;
372         int err;
373         struct net *net = dev_net(icmp_skb->dev);
374
375         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
376                                        th->dest, iph->saddr, ntohs(th->source),
377                                        inet_iif(icmp_skb));
378         if (!sk) {
379                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
380                 return;
381         }
382         if (sk->sk_state == TCP_TIME_WAIT) {
383                 inet_twsk_put(inet_twsk(sk));
384                 return;
385         }
386         seq = ntohl(th->seq);
387         if (sk->sk_state == TCP_NEW_SYN_RECV)
388                 return tcp_req_err(sk, seq,
389                                   type == ICMP_PARAMETERPROB ||
390                                   type == ICMP_TIME_EXCEEDED ||
391                                   (type == ICMP_DEST_UNREACH &&
392                                    (code == ICMP_NET_UNREACH ||
393                                     code == ICMP_HOST_UNREACH)));
394
395         bh_lock_sock(sk);
396         /* If too many ICMPs get dropped on busy
397          * servers this needs to be solved differently.
398          * We do take care of PMTU discovery (RFC1191) special case :
399          * we can receive locally generated ICMP messages while socket is held.
400          */
401         if (sock_owned_by_user(sk)) {
402                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
403                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
404         }
405         if (sk->sk_state == TCP_CLOSE)
406                 goto out;
407
408         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
409                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
410                 goto out;
411         }
412
413         icsk = inet_csk(sk);
414         tp = tcp_sk(sk);
415         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
416         fastopen = tp->fastopen_rsk;
417         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
418         if (sk->sk_state != TCP_LISTEN &&
419             !between(seq, snd_una, tp->snd_nxt)) {
420                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
421                 goto out;
422         }
423
424         switch (type) {
425         case ICMP_REDIRECT:
426                 if (!sock_owned_by_user(sk))
427                         do_redirect(icmp_skb, sk);
428                 goto out;
429         case ICMP_SOURCE_QUENCH:
430                 /* Just silently ignore these. */
431                 goto out;
432         case ICMP_PARAMETERPROB:
433                 err = EPROTO;
434                 break;
435         case ICMP_DEST_UNREACH:
436                 if (code > NR_ICMP_UNREACH)
437                         goto out;
438
439                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
440                         /* We are not interested in TCP_LISTEN and open_requests
441                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
442                          * they should go through unfragmented).
443                          */
444                         if (sk->sk_state == TCP_LISTEN)
445                                 goto out;
446
447                         tp->mtu_info = info;
448                         if (!sock_owned_by_user(sk)) {
449                                 tcp_v4_mtu_reduced(sk);
450                         } else {
451                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
452                                         sock_hold(sk);
453                         }
454                         goto out;
455                 }
456
457                 err = icmp_err_convert[code].errno;
458                 /* check if icmp_skb allows revert of backoff
459                  * (see draft-zimmermann-tcp-lcd) */
460                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
461                         break;
462                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
463                     !icsk->icsk_backoff || fastopen)
464                         break;
465
466                 if (sock_owned_by_user(sk))
467                         break;
468
469                 icsk->icsk_backoff--;
470                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
471                                                TCP_TIMEOUT_INIT;
472                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
473
474                 skb = tcp_write_queue_head(sk);
475                 BUG_ON(!skb);
476
477                 remaining = icsk->icsk_rto -
478                             min(icsk->icsk_rto,
479                                 tcp_time_stamp - tcp_skb_timestamp(skb));
480
481                 if (remaining) {
482                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
483                                                   remaining, TCP_RTO_MAX);
484                 } else {
485                         /* RTO revert clocked out retransmission.
486                          * Will retransmit now */
487                         tcp_retransmit_timer(sk);
488                 }
489
490                 break;
491         case ICMP_TIME_EXCEEDED:
492                 err = EHOSTUNREACH;
493                 break;
494         default:
495                 goto out;
496         }
497
498         switch (sk->sk_state) {
499         case TCP_SYN_SENT:
500         case TCP_SYN_RECV:
501                 /* Only in fast or simultaneous open. If a fast open socket is
502                  * already accepted it is treated as a connected one below.
503                  */
504                 if (fastopen && !fastopen->sk)
505                         break;
506
507                 if (!sock_owned_by_user(sk)) {
508                         sk->sk_err = err;
509
510                         sk->sk_error_report(sk);
511
512                         tcp_done(sk);
513                 } else {
514                         sk->sk_err_soft = err;
515                 }
516                 goto out;
517         }
518
519         /* If we've already connected we will keep trying
520          * until we time out, or the user gives up.
521          *
522          * rfc1122 4.2.3.9 allows us to consider as hard errors
523          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
524          * but it is obsoleted by pmtu discovery).
525          *
526          * Note that in the modern internet, where routing is unreliable
527          * and broken firewalls sit in every dark corner sending random
528          * errors ordered by their masters, even these two messages finally lose
529          * their original sense (even Linux sends invalid PORT_UNREACHs).
530          *
531          * Now we are in compliance with RFCs.
532          *                                                      --ANK (980905)
533          */
534
535         inet = inet_sk(sk);
536         if (!sock_owned_by_user(sk) && inet->recverr) {
537                 sk->sk_err = err;
538                 sk->sk_error_report(sk);
539         } else  { /* Only an error on timeout */
540                 sk->sk_err_soft = err;
541         }
542
543 out:
544         bh_unlock_sock(sk);
545         sock_put(sk);
546 }
547
548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
549 {
550         struct tcphdr *th = tcp_hdr(skb);
551
552         if (skb->ip_summed == CHECKSUM_PARTIAL) {
553                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
554                 skb->csum_start = skb_transport_header(skb) - skb->head;
555                 skb->csum_offset = offsetof(struct tcphdr, check);
556         } else {
557                 th->check = tcp_v4_check(skb->len, saddr, daddr,
558                                          csum_partial(th,
559                                                       th->doff << 2,
560                                                       skb->csum));
561         }
562 }
563
564 /* This routine computes an IPv4 TCP checksum. */
565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
566 {
567         const struct inet_sock *inet = inet_sk(sk);
568
569         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
570 }
571 EXPORT_SYMBOL(tcp_v4_send_check);
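/* Illustrative sketch, not part of this file: what the non-CHECKSUM_PARTIAL
 * branch of __tcp_v4_send_check() computes, written as a plain 16-bit
 * one's-complement sum over the IPv4 pseudo-header followed by the TCP
 * header and payload.  Helper names are made up; saddr, daddr and tcp_len
 * are taken in host byte order.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint32_t example_sum16(const void *data, size_t len, uint32_t sum)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;	/* pad the odd trailing byte */
	return sum;
}

static uint16_t example_tcp_v4_csum(uint32_t saddr, uint32_t daddr,
				    const void *tcp_seg, uint16_t tcp_len)
{
	uint32_t sum = 0;

	/* Pseudo-header: saddr, daddr, zero, protocol (6), TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;				/* IPPROTO_TCP */
	sum += tcp_len;

	/* TCP header plus data, with the checksum field set to zero. */
	sum = example_sum16(tcp_seg, tcp_len, sum);

	/* Fold carries and take the one's complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif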
572
573 /*
574  *      This routine will send an RST to the other tcp.
575  *
576  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
577  *                    for reset.
578  *      Answer: if a packet caused the RST, it is not for a socket
579  *              existing in our system; if it is matched to a socket,
580  *              it is just a duplicate segment or a bug in the other side's TCP.
581  *              So we build the reply based only on parameters that
582  *              arrived with the segment.
583  *      Exception: precedence violation. We do not implement it in any case.
584  */
585
586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
587 {
588         const struct tcphdr *th = tcp_hdr(skb);
589         struct {
590                 struct tcphdr th;
591 #ifdef CONFIG_TCP_MD5SIG
592                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
593 #endif
594         } rep;
595         struct ip_reply_arg arg;
596 #ifdef CONFIG_TCP_MD5SIG
597         struct tcp_md5sig_key *key;
598         const __u8 *hash_location = NULL;
599         unsigned char newhash[16];
600         int genhash;
601         struct sock *sk1 = NULL;
602 #endif
603         struct net *net;
604
605         /* Never send a reset in response to a reset. */
606         if (th->rst)
607                 return;
608
609         /* If sk not NULL, it means we did a successful lookup and incoming
610          * route had to be correct. prequeue might have dropped our dst.
611          */
612         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
613                 return;
614
615         /* Swap the send and the receive. */
616         memset(&rep, 0, sizeof(rep));
617         rep.th.dest   = th->source;
618         rep.th.source = th->dest;
619         rep.th.doff   = sizeof(struct tcphdr) / 4;
620         rep.th.rst    = 1;
621
622         if (th->ack) {
623                 rep.th.seq = th->ack_seq;
624         } else {
625                 rep.th.ack = 1;
626                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
627                                        skb->len - (th->doff << 2));
628         }
629
630         memset(&arg, 0, sizeof(arg));
631         arg.iov[0].iov_base = (unsigned char *)&rep;
632         arg.iov[0].iov_len  = sizeof(rep.th);
633
634         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
635 #ifdef CONFIG_TCP_MD5SIG
636         hash_location = tcp_parse_md5sig_option(th);
637         if (!sk && hash_location) {
638                 /*
639                  * The active side is lost. Try to find the listening socket through
640                  * the source port, and then find the md5 key through the listening socket.
641                  * We do not lose security here:
642                  * the incoming packet is checked against the md5 hash of the key found,
643                  * and no RST is generated if the md5 hash doesn't match.
644                  */
645                 sk1 = __inet_lookup_listener(net,
646                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
647                                              th->source, ip_hdr(skb)->daddr,
648                                              ntohs(th->source), inet_iif(skb));
649                 /* don't send an RST if we can't find a key */
650                 if (!sk1)
651                         return;
652                 rcu_read_lock();
653                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
654                                         &ip_hdr(skb)->saddr, AF_INET);
655                 if (!key)
656                         goto release_sk1;
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto release_sk1;
661         } else {
662                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
663                                              &ip_hdr(skb)->saddr,
664                                              AF_INET) : NULL;
665         }
666
667         if (key) {
668                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
669                                    (TCPOPT_NOP << 16) |
670                                    (TCPOPT_MD5SIG << 8) |
671                                    TCPOLEN_MD5SIG);
672                 /* Update length and the length the header thinks exists */
673                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
674                 rep.th.doff = arg.iov[0].iov_len / 4;
675
676                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
677                                      key, ip_hdr(skb)->saddr,
678                                      ip_hdr(skb)->daddr, &rep.th);
679         }
680 #endif
681         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
682                                       ip_hdr(skb)->saddr, /* XXX */
683                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
684         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
685         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
686         /* When the socket is gone, all binding information is lost and
687          * routing might fail in this case. No choice here: if we choose to force
688          * the input interface, we will misroute in case of an asymmetric route.
689          */
690         if (sk)
691                 arg.bound_dev_if = sk->sk_bound_dev_if;
692
693         arg.tos = ip_hdr(skb)->tos;
694         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
695         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
696                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
697                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
698                               &arg, arg.iov[0].iov_len);
699
700         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
701         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
702
703 #ifdef CONFIG_TCP_MD5SIG
704 release_sk1:
705         if (sk1) {
706                 rcu_read_unlock();
707                 sock_put(sk1);
708         }
709 #endif
710 }
711
712 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
713    outside socket context, is certainly ugly. What can I do?
714  */
715
716 static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
717                             u32 seq, u32 ack,
718                             u32 win, u32 tsval, u32 tsecr, int oif,
719                             struct tcp_md5sig_key *key,
720                             int reply_flags, u8 tos)
721 {
722         const struct tcphdr *th = tcp_hdr(skb);
723         struct {
724                 struct tcphdr th;
725                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
726 #ifdef CONFIG_TCP_MD5SIG
727                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
728 #endif
729                         ];
730         } rep;
731         struct ip_reply_arg arg;
732         struct net *net = sock_net(sk);
733
734         memset(&rep.th, 0, sizeof(struct tcphdr));
735         memset(&arg, 0, sizeof(arg));
736
737         arg.iov[0].iov_base = (unsigned char *)&rep;
738         arg.iov[0].iov_len  = sizeof(rep.th);
739         if (tsecr) {
740                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
741                                    (TCPOPT_TIMESTAMP << 8) |
742                                    TCPOLEN_TIMESTAMP);
743                 rep.opt[1] = htonl(tsval);
744                 rep.opt[2] = htonl(tsecr);
745                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
746         }
747
748         /* Swap the send and the receive. */
749         rep.th.dest    = th->source;
750         rep.th.source  = th->dest;
751         rep.th.doff    = arg.iov[0].iov_len / 4;
752         rep.th.seq     = htonl(seq);
753         rep.th.ack_seq = htonl(ack);
754         rep.th.ack     = 1;
755         rep.th.window  = htons(win);
756
757 #ifdef CONFIG_TCP_MD5SIG
758         if (key) {
759                 int offset = (tsecr) ? 3 : 0;
760
761                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
762                                           (TCPOPT_NOP << 16) |
763                                           (TCPOPT_MD5SIG << 8) |
764                                           TCPOLEN_MD5SIG);
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len/4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
769                                     key, ip_hdr(skb)->saddr,
770                                     ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.flags = reply_flags;
774         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
775                                       ip_hdr(skb)->saddr, /* XXX */
776                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
777         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
778         if (oif)
779                 arg.bound_dev_if = oif;
780         arg.tos = tos;
781         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
782         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
783                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
784                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
785                               &arg, arg.iov[0].iov_len);
786
787         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788 }
789
790 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
791 {
792         struct inet_timewait_sock *tw = inet_twsk(sk);
793         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
794
795         tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797                         tcp_time_stamp + tcptw->tw_ts_offset,
798                         tcptw->tw_ts_recent,
799                         tw->tw_bound_dev_if,
800                         tcp_twsk_md5_key(tcptw),
801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802                         tw->tw_tos
803                         );
804
805         inet_twsk_put(tw);
806 }
807
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809                                   struct request_sock *req)
810 {
811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813          */
814         tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
815                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
816                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
817                         tcp_time_stamp,
818                         req->ts_recent,
819                         0,
820                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
821                                           AF_INET),
822                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823                         ip_hdr(skb)->tos);
824 }
825
826 /*
827  *      Send a SYN-ACK after having received a SYN.
828  *      This still operates on a request_sock only, not on a big
829  *      socket.
830  */
831 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
832                               struct flowi *fl,
833                               struct request_sock *req,
834                               struct tcp_fastopen_cookie *foc,
835                                   bool attach_req)
836 {
837         const struct inet_request_sock *ireq = inet_rsk(req);
838         struct flowi4 fl4;
839         int err = -1;
840         struct sk_buff *skb;
841
842         /* First, grab a route. */
843         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
844                 return -1;
845
846         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
847
848         if (skb) {
849                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
850
851                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
852                                             ireq->ir_rmt_addr,
853                                             ireq_opt_deref(ireq));
854                 err = net_xmit_eval(err);
855         }
856
857         return err;
858 }
859
860 /*
861  *      IPv4 request_sock destructor.
862  */
863 static void tcp_v4_reqsk_destructor(struct request_sock *req)
864 {
865         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
866 }
867
868
869 #ifdef CONFIG_TCP_MD5SIG
870 /*
871  * RFC2385 MD5 checksumming requires a mapping of
872  * IP address->MD5 Key.
873  * We need to maintain these in the sk structure.
874  */
875
876 /* Find the Key structure for an address.  */
877 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
878                                          const union tcp_md5_addr *addr,
879                                          int family)
880 {
881         const struct tcp_sock *tp = tcp_sk(sk);
882         struct tcp_md5sig_key *key;
883         unsigned int size = sizeof(struct in_addr);
884         const struct tcp_md5sig_info *md5sig;
885
886         /* caller either holds rcu_read_lock() or socket lock */
887         md5sig = rcu_dereference_check(tp->md5sig_info,
888                                        sock_owned_by_user(sk) ||
889                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
890         if (!md5sig)
891                 return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893         if (family == AF_INET6)
894                 size = sizeof(struct in6_addr);
895 #endif
896         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897                 if (key->family != family)
898                         continue;
899                 if (!memcmp(&key->addr, addr, size))
900                         return key;
901         }
902         return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907                                          const struct sock *addr_sk)
908 {
909         const union tcp_md5_addr *addr;
910
911         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912         return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920         /* Add Key to the list */
921         struct tcp_md5sig_key *key;
922         struct tcp_sock *tp = tcp_sk(sk);
923         struct tcp_md5sig_info *md5sig;
924
925         key = tcp_md5_do_lookup(sk, addr, family);
926         if (key) {
927                 /* Pre-existing entry - just update that one. */
928                 memcpy(key->key, newkey, newkeylen);
929                 key->keylen = newkeylen;
930                 return 0;
931         }
932
933         md5sig = rcu_dereference_protected(tp->md5sig_info,
934                                            sock_owned_by_user(sk) ||
935                                            lockdep_is_held(&sk->sk_lock.slock));
936         if (!md5sig) {
937                 md5sig = kmalloc(sizeof(*md5sig), gfp);
938                 if (!md5sig)
939                         return -ENOMEM;
940
941                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
942                 INIT_HLIST_HEAD(&md5sig->head);
943                 rcu_assign_pointer(tp->md5sig_info, md5sig);
944         }
945
946         key = sock_kmalloc(sk, sizeof(*key), gfp);
947         if (!key)
948                 return -ENOMEM;
949         if (!tcp_alloc_md5sig_pool()) {
950                 sock_kfree_s(sk, key, sizeof(*key));
951                 return -ENOMEM;
952         }
953
954         memcpy(key->key, newkey, newkeylen);
955         key->keylen = newkeylen;
956         key->family = family;
957         memcpy(&key->addr, addr,
958                (family == AF_INET6) ? sizeof(struct in6_addr) :
959                                       sizeof(struct in_addr));
960         hlist_add_head_rcu(&key->node, &md5sig->head);
961         return 0;
962 }
963 EXPORT_SYMBOL(tcp_md5_do_add);
964
965 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
966 {
967         struct tcp_md5sig_key *key;
968
969         key = tcp_md5_do_lookup(sk, addr, family);
970         if (!key)
971                 return -ENOENT;
972         hlist_del_rcu(&key->node);
973         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
974         kfree_rcu(key, rcu);
975         return 0;
976 }
977 EXPORT_SYMBOL(tcp_md5_do_del);
978
979 static void tcp_clear_md5_list(struct sock *sk)
980 {
981         struct tcp_sock *tp = tcp_sk(sk);
982         struct tcp_md5sig_key *key;
983         struct hlist_node *n;
984         struct tcp_md5sig_info *md5sig;
985
986         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
987
988         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
989                 hlist_del_rcu(&key->node);
990                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991                 kfree_rcu(key, rcu);
992         }
993 }
994
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996                                  int optlen)
997 {
998         struct tcp_md5sig cmd;
999         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000
1001         if (optlen < sizeof(cmd))
1002                 return -EINVAL;
1003
1004         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1005                 return -EFAULT;
1006
1007         if (sin->sin_family != AF_INET)
1008                 return -EINVAL;
1009
1010         if (!cmd.tcpm_keylen)
1011                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1012                                       AF_INET);
1013
1014         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015                 return -EINVAL;
1016
1017         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1019                               GFP_KERNEL);
1020 }
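/* Illustrative sketch, not part of this file: the setsockopt() path that ends
 * up in tcp_v4_parse_md5_keys() above.  A user-space program installs an
 * RFC 2385 key for a given remote IPv4 address before connect() or listen().
 * The helper name is made up; key and peer address are caller-supplied.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int example_set_md5_key(int fd, const struct sockaddr_in *peer,
			       const void *key, int keylen)
{
	struct tcp_md5sig md5;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;	/* at most TCP_MD5SIG_MAXKEYLEN (80) bytes */
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif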
1021
1022 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1023                                         __be32 daddr, __be32 saddr, int nbytes)
1024 {
1025         struct tcp4_pseudohdr *bp;
1026         struct scatterlist sg;
1027
1028         bp = &hp->md5_blk.ip4;
1029
1030         /*
1031          * 1. the TCP pseudo-header (in the order: source IP address,
1032          * destination IP address, zero-padded protocol number, and
1033          * segment length)
1034          */
1035         bp->saddr = saddr;
1036         bp->daddr = daddr;
1037         bp->pad = 0;
1038         bp->protocol = IPPROTO_TCP;
1039         bp->len = cpu_to_be16(nbytes);
1040
1041         sg_init_one(&sg, bp, sizeof(*bp));
1042         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1043 }
1044
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048         struct tcp_md5sig_pool *hp;
1049         struct hash_desc *desc;
1050
1051         hp = tcp_get_md5sig_pool();
1052         if (!hp)
1053                 goto clear_hash_noput;
1054         desc = &hp->md5_desc;
1055
1056         if (crypto_hash_init(desc))
1057                 goto clear_hash;
1058         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059                 goto clear_hash;
1060         if (tcp_md5_hash_header(hp, th))
1061                 goto clear_hash;
1062         if (tcp_md5_hash_key(hp, key))
1063                 goto clear_hash;
1064         if (crypto_hash_final(desc, md5_hash))
1065                 goto clear_hash;
1066
1067         tcp_put_md5sig_pool();
1068         return 0;
1069
1070 clear_hash:
1071         tcp_put_md5sig_pool();
1072 clear_hash_noput:
1073         memset(md5_hash, 0, 16);
1074         return 1;
1075 }
1076
1077 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1078                         const struct sock *sk,
1079                         const struct sk_buff *skb)
1080 {
1081         struct tcp_md5sig_pool *hp;
1082         struct hash_desc *desc;
1083         const struct tcphdr *th = tcp_hdr(skb);
1084         __be32 saddr, daddr;
1085
1086         if (sk) { /* valid for establish/request sockets */
1087                 saddr = sk->sk_rcv_saddr;
1088                 daddr = sk->sk_daddr;
1089         } else {
1090                 const struct iphdr *iph = ip_hdr(skb);
1091                 saddr = iph->saddr;
1092                 daddr = iph->daddr;
1093         }
1094
1095         hp = tcp_get_md5sig_pool();
1096         if (!hp)
1097                 goto clear_hash_noput;
1098         desc = &hp->md5_desc;
1099
1100         if (crypto_hash_init(desc))
1101                 goto clear_hash;
1102
1103         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1104                 goto clear_hash;
1105         if (tcp_md5_hash_header(hp, th))
1106                 goto clear_hash;
1107         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1108                 goto clear_hash;
1109         if (tcp_md5_hash_key(hp, key))
1110                 goto clear_hash;
1111         if (crypto_hash_final(desc, md5_hash))
1112                 goto clear_hash;
1113
1114         tcp_put_md5sig_pool();
1115         return 0;
1116
1117 clear_hash:
1118         tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120         memset(md5_hash, 0, 16);
1121         return 1;
1122 }
1123 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1124
1125 #endif
1126
1127 /* Called with rcu_read_lock() */
1128 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1129                                     const struct sk_buff *skb)
1130 {
1131 #ifdef CONFIG_TCP_MD5SIG
1132         /*
1133          * This gets called for each TCP segment that arrives
1134          * so we want to be efficient.
1135          * We have 3 drop cases:
1136          * o No MD5 hash and one expected.
1137          * o MD5 hash and we're not expecting one.
1138          * o MD5 hash and it's wrong.
1139          */
1140         const __u8 *hash_location = NULL;
1141         struct tcp_md5sig_key *hash_expected;
1142         const struct iphdr *iph = ip_hdr(skb);
1143         const struct tcphdr *th = tcp_hdr(skb);
1144         int genhash;
1145         unsigned char newhash[16];
1146
1147         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1148                                           AF_INET);
1149         hash_location = tcp_parse_md5sig_option(th);
1150
1151         /* We've parsed the options - do we have a hash? */
1152         if (!hash_expected && !hash_location)
1153                 return false;
1154
1155         if (hash_expected && !hash_location) {
1156                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1157                 return true;
1158         }
1159
1160         if (!hash_expected && hash_location) {
1161                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1162                 return true;
1163         }
1164
1165         /* Okay, so this is hash_expected and hash_location -
1166          * so we need to calculate the checksum.
1167          */
1168         genhash = tcp_v4_md5_hash_skb(newhash,
1169                                       hash_expected,
1170                                       NULL, skb);
1171
1172         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1174                                      &iph->saddr, ntohs(th->source),
1175                                      &iph->daddr, ntohs(th->dest),
1176                                      genhash ? " tcp_v4_calc_md5_hash failed"
1177                                      : "");
1178                 return true;
1179         }
1180         return false;
1181 #endif
1182         return false;
1183 }
1184
1185 static void tcp_v4_init_req(struct request_sock *req,
1186                             const struct sock *sk_listener,
1187                             struct sk_buff *skb)
1188 {
1189         struct inet_request_sock *ireq = inet_rsk(req);
1190
1191         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1192         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1193         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1194         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
1195 }
1196
1197 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1198                                           struct flowi *fl,
1199                                           const struct request_sock *req,
1200                                           bool *strict)
1201 {
1202         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1203
1204         if (strict) {
1205                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1206                         *strict = true;
1207                 else
1208                         *strict = false;
1209         }
1210
1211         return dst;
1212 }
1213
1214 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1215         .family         =       PF_INET,
1216         .obj_size       =       sizeof(struct tcp_request_sock),
1217         .rtx_syn_ack    =       tcp_rtx_synack,
1218         .send_ack       =       tcp_v4_reqsk_send_ack,
1219         .destructor     =       tcp_v4_reqsk_destructor,
1220         .send_reset     =       tcp_v4_send_reset,
1221         .syn_ack_timeout =      tcp_syn_ack_timeout,
1222 };
1223
1224 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1225         .mss_clamp      =       TCP_MSS_DEFAULT,
1226 #ifdef CONFIG_TCP_MD5SIG
1227         .req_md5_lookup =       tcp_v4_md5_lookup,
1228         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1229 #endif
1230         .init_req       =       tcp_v4_init_req,
1231 #ifdef CONFIG_SYN_COOKIES
1232         .cookie_init_seq =      cookie_v4_init_sequence,
1233 #endif
1234         .route_req      =       tcp_v4_route_req,
1235         .init_seq       =       tcp_v4_init_sequence,
1236         .send_synack    =       tcp_v4_send_synack,
1237 };
1238
1239 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1240 {
1241         /* Never answer SYNs sent to broadcast or multicast */
1242         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243                 goto drop;
1244
1245         return tcp_conn_request(&tcp_request_sock_ops,
1246                                 &tcp_request_sock_ipv4_ops, sk, skb);
1247
1248 drop:
1249         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1250         return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_v4_conn_request);
1253
1254
1255 /*
1256  * The three way handshake has completed - we got a valid synack -
1257  * now create the new socket.
1258  */
1259 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1260                                   struct request_sock *req,
1261                                   struct dst_entry *dst,
1262                                   struct request_sock *req_unhash,
1263                                   bool *own_req)
1264 {
1265         struct inet_request_sock *ireq;
1266         struct inet_sock *newinet;
1267         struct tcp_sock *newtp;
1268         struct sock *newsk;
1269 #ifdef CONFIG_TCP_MD5SIG
1270         struct tcp_md5sig_key *key;
1271 #endif
1272         struct ip_options_rcu *inet_opt;
1273
1274         if (sk_acceptq_is_full(sk))
1275                 goto exit_overflow;
1276
1277         newsk = tcp_create_openreq_child(sk, req, skb);
1278         if (!newsk)
1279                 goto exit_nonewsk;
1280
1281         newsk->sk_gso_type = SKB_GSO_TCPV4;
1282         inet_sk_rx_dst_set(newsk, skb);
1283
1284         newtp                 = tcp_sk(newsk);
1285         newinet               = inet_sk(newsk);
1286         ireq                  = inet_rsk(req);
1287         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1288         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1289         newinet->inet_saddr   = ireq->ir_loc_addr;
1290         inet_opt              = rcu_dereference(ireq->ireq_opt);
1291         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1292         newinet->mc_index     = inet_iif(skb);
1293         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1294         newinet->rcv_tos      = ip_hdr(skb)->tos;
1295         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1296         if (inet_opt)
1297                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1298         newinet->inet_id = newtp->write_seq ^ jiffies;
1299
1300         if (!dst) {
1301                 dst = inet_csk_route_child_sock(sk, newsk, req);
1302                 if (!dst)
1303                         goto put_and_exit;
1304         } else {
1305                 /* syncookie case : see end of cookie_v4_check() */
1306         }
1307         sk_setup_caps(newsk, dst);
1308
1309         tcp_ca_openreq_child(newsk, dst);
1310
1311         tcp_sync_mss(newsk, dst_mtu(dst));
1312         newtp->advmss = dst_metric_advmss(dst);
1313         if (tcp_sk(sk)->rx_opt.user_mss &&
1314             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1315                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1316
1317         tcp_initialize_rcv_mss(newsk);
1318
1319 #ifdef CONFIG_TCP_MD5SIG
1320         /* Copy over the MD5 key from the original socket */
1321         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1322                                 AF_INET);
1323         if (key) {
1324                 /*
1325                  * We're using one, so create a matching key
1326                  * on the newsk structure. If we fail to get
1327                  * memory, then we end up not copying the key
1328                  * across. Shucks.
1329                  */
1330                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1331                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1332                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1333         }
1334 #endif
1335
1336         if (__inet_inherit_port(sk, newsk) < 0)
1337                 goto put_and_exit;
1338         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1339         if (likely(*own_req)) {
1340                 tcp_move_syn(newtp, req);
1341                 ireq->ireq_opt = NULL;
1342         } else {
1343                 newinet->inet_opt = NULL;
1344         }
1345         return newsk;
1346
1347 exit_overflow:
1348         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1349 exit_nonewsk:
1350         dst_release(dst);
1351 exit:
1352         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1353         return NULL;
1354 put_and_exit:
1355         newinet->inet_opt = NULL;
1356         inet_csk_prepare_forced_close(newsk);
1357         tcp_done(newsk);
1358         goto exit;
1359 }
1360 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1361
1362 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1363 {
1364 #ifdef CONFIG_SYN_COOKIES
1365         const struct tcphdr *th = tcp_hdr(skb);
1366
1367         if (!th->syn)
1368                 sk = cookie_v4_check(sk, skb);
1369 #endif
1370         return sk;
1371 }
1372
1373 /* The socket must have its spinlock held when we get
1374  * here, unless it is a TCP_LISTEN socket.
1375  *
1376  * We have a potential double-lock case here, so even when
1377  * doing backlog processing we use the BH locking scheme.
1378  * This is because we cannot sleep with the original spinlock
1379  * held.
1380  */
1381 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1382 {
1383         struct sock *rsk;
1384
1385         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1386                 struct dst_entry *dst = sk->sk_rx_dst;
1387
1388                 sock_rps_save_rxhash(sk, skb);
1389                 sk_mark_napi_id(sk, skb);
1390                 if (dst) {
1391                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1392                             !dst->ops->check(dst, 0)) {
1393                                 dst_release(dst);
1394                                 sk->sk_rx_dst = NULL;
1395                         }
1396                 }
1397                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1398                 return 0;
1399         }
1400
1401         if (tcp_checksum_complete(skb))
1402                 goto csum_err;
1403
1404         if (sk->sk_state == TCP_LISTEN) {
1405                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1406
1407                 if (!nsk)
1408                         goto discard;
1409                 if (nsk != sk) {
1410                         sock_rps_save_rxhash(nsk, skb);
1411                         sk_mark_napi_id(nsk, skb);
1412                         if (tcp_child_process(sk, nsk, skb)) {
1413                                 rsk = nsk;
1414                                 goto reset;
1415                         }
1416                         return 0;
1417                 }
1418         } else
1419                 sock_rps_save_rxhash(sk, skb);
1420
1421         if (tcp_rcv_state_process(sk, skb)) {
1422                 rsk = sk;
1423                 goto reset;
1424         }
1425         return 0;
1426
1427 reset:
1428         tcp_v4_send_reset(rsk, skb);
1429 discard:
1430         kfree_skb(skb);
1431         /* Be careful here. If this function gets more complicated and
1432          * gcc suffers from register pressure on the x86, sk (in %ebx)
1433          * might be destroyed here. This current version compiles correctly,
1434          * but you have been warned.
1435          */
1436         return 0;
1437
1438 csum_err:
1439         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1440         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1441         goto discard;
1442 }
1443 EXPORT_SYMBOL(tcp_v4_do_rcv);
1444
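/* Called from the IP receive path before the routing decision.  If the
 * segment belongs to an established socket, attach that socket to the skb
 * and, when the cached rx dst is still valid for the incoming interface,
 * reuse it so the normal route lookup can be skipped.
 */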
1445 void tcp_v4_early_demux(struct sk_buff *skb)
1446 {
1447         const struct iphdr *iph;
1448         const struct tcphdr *th;
1449         struct sock *sk;
1450
1451         if (skb->pkt_type != PACKET_HOST)
1452                 return;
1453
1454         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1455                 return;
1456
1457         iph = ip_hdr(skb);
1458         th = tcp_hdr(skb);
1459
1460         if (th->doff < sizeof(struct tcphdr) / 4)
1461                 return;
1462
1463         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1464                                        iph->saddr, th->source,
1465                                        iph->daddr, ntohs(th->dest),
1466                                        skb->skb_iif);
1467         if (sk) {
1468                 skb->sk = sk;
1469                 skb->destructor = sock_edemux;
1470                 if (sk_fullsock(sk)) {
1471                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1472
1473                         if (dst)
1474                                 dst = dst_check(dst, 0);
1475                         if (dst &&
1476                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1477                                 skb_dst_set_noref(skb, dst);
1478                 }
1479         }
1480 }
1481
1482 /* The packet is added to the VJ-style prequeue for processing in process
1483  * context, if a reader task is waiting. Apparently, this exciting
1484  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1485  * failed somewhere. Latency? Burstiness? Well, at least now we will
1486  * see why it failed. 8)8)                                --ANK
1487  *
1488  */
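/* The prequeue is bypassed entirely when net.ipv4.tcp_low_latency is set or
 * when there is no reader task waiting in tcp_recvmsg() (!tp->ucopy.task);
 * in that case the caller falls back to tcp_v4_do_rcv().
 */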
1489 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1490 {
1491         struct tcp_sock *tp = tcp_sk(sk);
1492
1493         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1494                 return false;
1495
1496         if (skb->len <= tcp_hdrlen(skb) &&
1497             skb_queue_len(&tp->ucopy.prequeue) == 0)
1498                 return false;
1499
1500         /* Before escaping the RCU protected region, we need to take care of
1501          * the skb dst. The prequeue is only enabled for established sockets.
1502          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1503          * Instead of doing a full sk_rx_dst validity check here, let's perform
1504          * an optimistic one.
1505          */
1506         if (likely(sk->sk_rx_dst))
1507                 skb_dst_drop(skb);
1508         else
1509                 skb_dst_force_safe(skb);
1510
1511         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1512         tp->ucopy.memory += skb->truesize;
1513         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1514                 struct sk_buff *skb1;
1515
1516                 BUG_ON(sock_owned_by_user(sk));
1517
1518                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1519                         sk_backlog_rcv(sk, skb1);
1520                         NET_INC_STATS_BH(sock_net(sk),
1521                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1522                 }
1523
1524                 tp->ucopy.memory = 0;
1525         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1526                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1527                                            POLLIN | POLLRDNORM | POLLRDBAND);
1528                 if (!inet_csk_ack_scheduled(sk))
1529                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1530                                                   (3 * tcp_rto_min(sk)) / 4,
1531                                                   TCP_RTO_MAX);
1532         }
1533         return true;
1534 }
1535 EXPORT_SYMBOL(tcp_prequeue);
1536
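/* Run the socket filter attached to @sk, allowing it to trim the skb but
 * never below the TCP header (th->doff * 4 bytes).  If payload was trimmed,
 * end_seq is adjusted so sequence accounting still matches the shorter skb.
 */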
1537 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1538 {
1539         struct tcphdr *th = (struct tcphdr *)skb->data;
1540         unsigned int eaten = skb->len;
1541         int err;
1542
1543         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1544         if (!err) {
1545                 eaten -= skb->len;
1546                 TCP_SKB_CB(skb)->end_seq -= eaten;
1547         }
1548         return err;
1549 }
1550 EXPORT_SYMBOL(tcp_filter);
1551
1552 /*
1553  *      From tcp_input.c
1554  */
1555
1556 int tcp_v4_rcv(struct sk_buff *skb)
1557 {
1558         const struct iphdr *iph;
1559         const struct tcphdr *th;
1560         struct sock *sk;
1561         int ret;
1562         struct net *net = dev_net(skb->dev);
1563
1564         if (skb->pkt_type != PACKET_HOST)
1565                 goto discard_it;
1566
1567         /* Count it even if it's bad */
1568         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1569
1570         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1571                 goto discard_it;
1572
1573         th = tcp_hdr(skb);
1574
1575         if (th->doff < sizeof(struct tcphdr) / 4)
1576                 goto bad_packet;
1577         if (!pskb_may_pull(skb, th->doff * 4))
1578                 goto discard_it;
1579
1580         /* An explanation is required here, I think.
1581          * Packet length and doff are validated by header prediction,
1582          * provided the case of th->doff == 0 is eliminated.
1583          * So, we defer the checks. */
1584
1585         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1586                 goto csum_error;
1587
1588         th = tcp_hdr(skb);
1589         iph = ip_hdr(skb);
1590         /* This is tricky: we move the IPCB to its correct location inside
1591          * TCP_SKB_CB(). barrier() makes sure the compiler won't play aliasing games.
1592          */
1593         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1594                 sizeof(struct inet_skb_parm));
1595         barrier();
1596
1597         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1598         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1599                                     skb->len - th->doff * 4);
1600         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1601         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1602         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1603         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1604         TCP_SKB_CB(skb)->sacked  = 0;
1605
1606 lookup:
1607         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1608         if (!sk)
1609                 goto no_tcp_socket;
1610
1611 process:
1612         if (sk->sk_state == TCP_TIME_WAIT)
1613                 goto do_time_wait;
1614
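        /* A request sock found in the ehash means this is most likely the
         * final ACK of a three-way handshake.  Validate it against the
         * listener (MD5, listener still in TCP_LISTEN), let tcp_check_req()
         * create the full child socket, and hand the skb to the child via
         * tcp_child_process().
         */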
1615         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1616                 struct request_sock *req = inet_reqsk(sk);
1617                 struct sock *nsk;
1618
1619                 sk = req->rsk_listener;
1620                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1621                         reqsk_put(req);
1622                         goto discard_it;
1623                 }
1624                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1625                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1626                         goto lookup;
1627                 }
1628                 sock_hold(sk);
1629                 nsk = tcp_check_req(sk, skb, req, false);
1630                 if (!nsk) {
1631                         reqsk_put(req);
1632                         goto discard_and_relse;
1633                 }
1634                 if (nsk == sk) {
1635                         reqsk_put(req);
1636                 } else if (tcp_child_process(sk, nsk, skb)) {
1637                         tcp_v4_send_reset(nsk, skb);
1638                         goto discard_and_relse;
1639                 } else {
1640                         sock_put(sk);
1641                         return 0;
1642                 }
1643         }
1644         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1645                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1646                 goto discard_and_relse;
1647         }
1648
1649         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1650                 goto discard_and_relse;
1651
1652         if (tcp_v4_inbound_md5_hash(sk, skb))
1653                 goto discard_and_relse;
1654
1655         nf_reset(skb);
1656
1657         if (tcp_filter(sk, skb))
1658                 goto discard_and_relse;
1659         th = (const struct tcphdr *)skb->data;
1660         iph = ip_hdr(skb);
1661
1662         skb->dev = NULL;
1663
1664         if (sk->sk_state == TCP_LISTEN) {
1665                 ret = tcp_v4_do_rcv(sk, skb);
1666                 goto put_and_return;
1667         }
1668
1669         sk_incoming_cpu_update(sk);
1670
1671         bh_lock_sock_nested(sk);
1672         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1673         ret = 0;
1674         if (!sock_owned_by_user(sk)) {
1675                 if (!tcp_prequeue(sk, skb))
1676                         ret = tcp_v4_do_rcv(sk, skb);
1677         } else if (unlikely(sk_add_backlog(sk, skb,
1678                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1679                 bh_unlock_sock(sk);
1680                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1681                 goto discard_and_relse;
1682         }
1683         bh_unlock_sock(sk);
1684
1685 put_and_return:
1686         sock_put(sk);
1687
1688         return ret;
1689
1690 no_tcp_socket:
1691         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1692                 goto discard_it;
1693
1694         if (tcp_checksum_complete(skb)) {
1695 csum_error:
1696                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1697 bad_packet:
1698                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1699         } else {
1700                 tcp_v4_send_reset(NULL, skb);
1701         }
1702
1703 discard_it:
1704         /* Discard frame. */
1705         kfree_skb(skb);
1706         return 0;
1707
1708 discard_and_relse:
1709         sock_put(sk);
1710         goto discard_it;
1711
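/* Segments matching a TIME_WAIT socket: after policy and checksum checks,
 * tcp_timewait_state_process() decides whether to ACK, send a RST, ignore
 * the segment, or (TCP_TW_SYN) let a new SYN reuse the old 4-tuple when a
 * matching listener exists, in which case processing restarts on that
 * listener.
 */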
1712 do_time_wait:
1713         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1714                 inet_twsk_put(inet_twsk(sk));
1715                 goto discard_it;
1716         }
1717
1718         if (tcp_checksum_complete(skb)) {
1719                 inet_twsk_put(inet_twsk(sk));
1720                 goto csum_error;
1721         }
1722         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1723         case TCP_TW_SYN: {
1724                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1725                                                         &tcp_hashinfo,
1726                                                         iph->saddr, th->source,
1727                                                         iph->daddr, th->dest,
1728                                                         inet_iif(skb));
1729                 if (sk2) {
1730                         inet_twsk_deschedule_put(inet_twsk(sk));
1731                         sk = sk2;
1732                         goto process;
1733                 }
1734                 /* Fall through to ACK */
1735         }
1736         case TCP_TW_ACK:
1737                 tcp_v4_timewait_ack(sk, skb);
1738                 break;
1739         case TCP_TW_RST:
1740                 goto no_tcp_socket;
1741         case TCP_TW_SUCCESS:;
1742         }
1743         goto discard_it;
1744 }
1745
1746 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1747         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1748         .twsk_unique    = tcp_twsk_unique,
1749         .twsk_destructor= tcp_twsk_destructor,
1750 };
1751
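/* Cache the input route of a received skb on the socket.  dst_hold_safe()
 * only takes a reference when it is safe to do so, so the cached sk_rx_dst
 * can later be validated and reused by the receive fast path and by early
 * demux.
 */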
1752 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1753 {
1754         struct dst_entry *dst = skb_dst(skb);
1755
1756         if (dst && dst_hold_safe(dst)) {
1757                 sk->sk_rx_dst = dst;
1758                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1759         }
1760 }
1761 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1762
1763 const struct inet_connection_sock_af_ops ipv4_specific = {
1764         .queue_xmit        = ip_queue_xmit,
1765         .send_check        = tcp_v4_send_check,
1766         .rebuild_header    = inet_sk_rebuild_header,
1767         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1768         .conn_request      = tcp_v4_conn_request,
1769         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1770         .net_header_len    = sizeof(struct iphdr),
1771         .setsockopt        = ip_setsockopt,
1772         .getsockopt        = ip_getsockopt,
1773         .addr2sockaddr     = inet_csk_addr2sockaddr,
1774         .sockaddr_len      = sizeof(struct sockaddr_in),
1775         .bind_conflict     = inet_csk_bind_conflict,
1776 #ifdef CONFIG_COMPAT
1777         .compat_setsockopt = compat_ip_setsockopt,
1778         .compat_getsockopt = compat_ip_getsockopt,
1779 #endif
1780         .mtu_reduced       = tcp_v4_mtu_reduced,
1781 };
1782 EXPORT_SYMBOL(ipv4_specific);
1783
1784 #ifdef CONFIG_TCP_MD5SIG
1785 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1786         .md5_lookup             = tcp_v4_md5_lookup,
1787         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1788         .md5_parse              = tcp_v4_parse_md5_keys,
1789 };
1790 #endif
1791
1792 /* NOTE: A lot of things are set to zero explicitly by the call to
1793  *       sk_alloc(), so they need not be done here.
1794  */
1795 static int tcp_v4_init_sock(struct sock *sk)
1796 {
1797         struct inet_connection_sock *icsk = inet_csk(sk);
1798
1799         tcp_init_sock(sk);
1800
1801         icsk->icsk_af_ops = &ipv4_specific;
1802
1803 #ifdef CONFIG_TCP_MD5SIG
1804         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1805 #endif
1806
1807         return 0;
1808 }
1809
1810 void tcp_v4_destroy_sock(struct sock *sk)
1811 {
1812         struct tcp_sock *tp = tcp_sk(sk);
1813
1814         tcp_clear_xmit_timers(sk);
1815
1816         tcp_cleanup_congestion_control(sk);
1817
1818         /* Clean up the write buffer. */
1819         tcp_write_queue_purge(sk);
1820
1821         /* Clean up our, hopefully empty, out_of_order_queue. */
1822         __skb_queue_purge(&tp->out_of_order_queue);
1823
1824 #ifdef CONFIG_TCP_MD5SIG
1825         /* Clean up the MD5 key list, if any */
1826         if (tp->md5sig_info) {
1827                 tcp_clear_md5_list(sk);
1828                 kfree_rcu(tp->md5sig_info, rcu);
1829                 tp->md5sig_info = NULL;
1830         }
1831 #endif
1832
1833         /* Clean up the prequeue; it really must be empty. */
1834         __skb_queue_purge(&tp->ucopy.prequeue);
1835
1836         /* Clean up a referenced TCP bind bucket. */
1837         if (inet_csk(sk)->icsk_bind_hash)
1838                 inet_put_port(sk);
1839
1840         BUG_ON(tp->fastopen_rsk);
1841
1842         /* If socket is aborted during connect operation */
1843         tcp_free_fastopen_req(tp);
1844         tcp_saved_syn_free(tp);
1845
1846         sk_sockets_allocated_dec(sk);
1847         sock_release_memcg(sk);
1848 }
1849 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1850
1851 #ifdef CONFIG_PROC_FS
1852 /* Proc filesystem TCP sock list dumping. */
1853
1854 /*
1855  * Get the next listener socket following cur.  If cur is NULL, get the first
1856  * socket starting from the bucket given in st->bucket; when st->bucket is
1857  * zero, the very first socket in the hash table is returned.
1858  */
1859 static void *listening_get_next(struct seq_file *seq, void *cur)
1860 {
1861         struct inet_connection_sock *icsk;
1862         struct hlist_nulls_node *node;
1863         struct sock *sk = cur;
1864         struct inet_listen_hashbucket *ilb;
1865         struct tcp_iter_state *st = seq->private;
1866         struct net *net = seq_file_net(seq);
1867
1868         if (!sk) {
1869                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870                 spin_lock_bh(&ilb->lock);
1871                 sk = sk_nulls_head(&ilb->head);
1872                 st->offset = 0;
1873                 goto get_sk;
1874         }
1875         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876         ++st->num;
1877         ++st->offset;
1878
1879         sk = sk_nulls_next(sk);
1880 get_sk:
1881         sk_nulls_for_each_from(sk, node) {
1882                 if (!net_eq(sock_net(sk), net))
1883                         continue;
1884                 if (sk->sk_family == st->family) {
1885                         cur = sk;
1886                         goto out;
1887                 }
1888                 icsk = inet_csk(sk);
1889         }
1890         spin_unlock_bh(&ilb->lock);
1891         st->offset = 0;
1892         if (++st->bucket < INET_LHTABLE_SIZE) {
1893                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1894                 spin_lock_bh(&ilb->lock);
1895                 sk = sk_nulls_head(&ilb->head);
1896                 goto get_sk;
1897         }
1898         cur = NULL;
1899 out:
1900         return cur;
1901 }
1902
1903 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1904 {
1905         struct tcp_iter_state *st = seq->private;
1906         void *rc;
1907
1908         st->bucket = 0;
1909         st->offset = 0;
1910         rc = listening_get_next(seq, NULL);
1911
1912         while (rc && *pos) {
1913                 rc = listening_get_next(seq, rc);
1914                 --*pos;
1915         }
1916         return rc;
1917 }
1918
1919 static inline bool empty_bucket(const struct tcp_iter_state *st)
1920 {
1921         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1922 }
1923
1924 /*
1925  * Get the first established socket, starting from the bucket in st->bucket.
1926  * If st->bucket is zero, the very first socket in the hash is returned.
1927  */
1928 static void *established_get_first(struct seq_file *seq)
1929 {
1930         struct tcp_iter_state *st = seq->private;
1931         struct net *net = seq_file_net(seq);
1932         void *rc = NULL;
1933
1934         st->offset = 0;
1935         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1936                 struct sock *sk;
1937                 struct hlist_nulls_node *node;
1938                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1939
1940                 /* Lockless fast path for the common case of empty buckets */
1941                 if (empty_bucket(st))
1942                         continue;
1943
1944                 spin_lock_bh(lock);
1945                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1946                         if (sk->sk_family != st->family ||
1947                             !net_eq(sock_net(sk), net)) {
1948                                 continue;
1949                         }
1950                         rc = sk;
1951                         goto out;
1952                 }
1953                 spin_unlock_bh(lock);
1954         }
1955 out:
1956         return rc;
1957 }
1958
1959 static void *established_get_next(struct seq_file *seq, void *cur)
1960 {
1961         struct sock *sk = cur;
1962         struct hlist_nulls_node *node;
1963         struct tcp_iter_state *st = seq->private;
1964         struct net *net = seq_file_net(seq);
1965
1966         ++st->num;
1967         ++st->offset;
1968
1969         sk = sk_nulls_next(sk);
1970
1971         sk_nulls_for_each_from(sk, node) {
1972                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1973                         return sk;
1974         }
1975
1976         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1977         ++st->bucket;
1978         return established_get_first(seq);
1979 }
1980
1981 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1982 {
1983         struct tcp_iter_state *st = seq->private;
1984         void *rc;
1985
1986         st->bucket = 0;
1987         rc = established_get_first(seq);
1988
1989         while (rc && pos) {
1990                 rc = established_get_next(seq, rc);
1991                 --pos;
1992         }
1993         return rc;
1994 }
1995
1996 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1997 {
1998         void *rc;
1999         struct tcp_iter_state *st = seq->private;
2000
2001         st->state = TCP_SEQ_STATE_LISTENING;
2002         rc        = listening_get_idx(seq, &pos);
2003
2004         if (!rc) {
2005                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2006                 rc        = established_get_idx(seq, pos);
2007         }
2008
2009         return rc;
2010 }
2011
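/* Resume iteration from the bucket/offset remembered at the end of the
 * previous read of the seq_file, so that reading /proc/net/tcp in chunks
 * does not rescan the hash tables from the beginning each time.
 */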
2012 static void *tcp_seek_last_pos(struct seq_file *seq)
2013 {
2014         struct tcp_iter_state *st = seq->private;
2015         int offset = st->offset;
2016         int orig_num = st->num;
2017         void *rc = NULL;
2018
2019         switch (st->state) {
2020         case TCP_SEQ_STATE_LISTENING:
2021                 if (st->bucket >= INET_LHTABLE_SIZE)
2022                         break;
2023                 st->state = TCP_SEQ_STATE_LISTENING;
2024                 rc = listening_get_next(seq, NULL);
2025                 while (offset-- && rc)
2026                         rc = listening_get_next(seq, rc);
2027                 if (rc)
2028                         break;
2029                 st->bucket = 0;
2030                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2031                 /* Fallthrough */
2032         case TCP_SEQ_STATE_ESTABLISHED:
2033                 if (st->bucket > tcp_hashinfo.ehash_mask)
2034                         break;
2035                 rc = established_get_first(seq);
2036                 while (offset-- && rc)
2037                         rc = established_get_next(seq, rc);
2038         }
2039
2040         st->num = orig_num;
2041
2042         return rc;
2043 }
2044
2045 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2046 {
2047         struct tcp_iter_state *st = seq->private;
2048         void *rc;
2049
2050         if (*pos && *pos == st->last_pos) {
2051                 rc = tcp_seek_last_pos(seq);
2052                 if (rc)
2053                         goto out;
2054         }
2055
2056         st->state = TCP_SEQ_STATE_LISTENING;
2057         st->num = 0;
2058         st->bucket = 0;
2059         st->offset = 0;
2060         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2061
2062 out:
2063         st->last_pos = *pos;
2064         return rc;
2065 }
2066
2067 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2068 {
2069         struct tcp_iter_state *st = seq->private;
2070         void *rc = NULL;
2071
2072         if (v == SEQ_START_TOKEN) {
2073                 rc = tcp_get_idx(seq, 0);
2074                 goto out;
2075         }
2076
2077         switch (st->state) {
2078         case TCP_SEQ_STATE_LISTENING:
2079                 rc = listening_get_next(seq, v);
2080                 if (!rc) {
2081                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2082                         st->bucket = 0;
2083                         st->offset = 0;
2084                         rc        = established_get_first(seq);
2085                 }
2086                 break;
2087         case TCP_SEQ_STATE_ESTABLISHED:
2088                 rc = established_get_next(seq, v);
2089                 break;
2090         }
2091 out:
2092         ++*pos;
2093         st->last_pos = *pos;
2094         return rc;
2095 }
2096
2097 static void tcp_seq_stop(struct seq_file *seq, void *v)
2098 {
2099         struct tcp_iter_state *st = seq->private;
2100
2101         switch (st->state) {
2102         case TCP_SEQ_STATE_LISTENING:
2103                 if (v != SEQ_START_TOKEN)
2104                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2105                 break;
2106         case TCP_SEQ_STATE_ESTABLISHED:
2107                 if (v)
2108                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2109                 break;
2110         }
2111 }
2112
2113 int tcp_seq_open(struct inode *inode, struct file *file)
2114 {
2115         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2116         struct tcp_iter_state *s;
2117         int err;
2118
2119         err = seq_open_net(inode, file, &afinfo->seq_ops,
2120                           sizeof(struct tcp_iter_state));
2121         if (err < 0)
2122                 return err;
2123
2124         s = ((struct seq_file *)file->private_data)->private;
2125         s->family               = afinfo->family;
2126         s->last_pos             = 0;
2127         return 0;
2128 }
2129 EXPORT_SYMBOL(tcp_seq_open);
2130
2131 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2132 {
2133         int rc = 0;
2134         struct proc_dir_entry *p;
2135
2136         afinfo->seq_ops.start           = tcp_seq_start;
2137         afinfo->seq_ops.next            = tcp_seq_next;
2138         afinfo->seq_ops.stop            = tcp_seq_stop;
2139
2140         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2141                              afinfo->seq_fops, afinfo);
2142         if (!p)
2143                 rc = -ENOMEM;
2144         return rc;
2145 }
2146 EXPORT_SYMBOL(tcp_proc_register);
2147
2148 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2149 {
2150         remove_proc_entry(afinfo->name, net->proc_net);
2151 }
2152 EXPORT_SYMBOL(tcp_proc_unregister);
2153
2154 static void get_openreq4(const struct request_sock *req,
2155                          struct seq_file *f, int i)
2156 {
2157         const struct inet_request_sock *ireq = inet_rsk(req);
2158         long delta = req->rsk_timer.expires - jiffies;
2159
2160         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2161                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2162                 i,
2163                 ireq->ir_loc_addr,
2164                 ireq->ir_num,
2165                 ireq->ir_rmt_addr,
2166                 ntohs(ireq->ir_rmt_port),
2167                 TCP_SYN_RECV,
2168                 0, 0, /* could print option size, but that is af dependent. */
2169                 1,    /* timers active (only the expire timer) */
2170                 jiffies_delta_to_clock_t(delta),
2171                 req->num_timeout,
2172                 from_kuid_munged(seq_user_ns(f),
2173                                  sock_i_uid(req->rsk_listener)),
2174                 0,  /* non standard timer */
2175                 0, /* open_requests have no inode */
2176                 0,
2177                 req);
2178 }
2179
2180 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2181 {
2182         int timer_active;
2183         unsigned long timer_expires;
2184         const struct tcp_sock *tp = tcp_sk(sk);
2185         const struct inet_connection_sock *icsk = inet_csk(sk);
2186         const struct inet_sock *inet = inet_sk(sk);
2187         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2188         __be32 dest = inet->inet_daddr;
2189         __be32 src = inet->inet_rcv_saddr;
2190         __u16 destp = ntohs(inet->inet_dport);
2191         __u16 srcp = ntohs(inet->inet_sport);
2192         int rx_queue;
2193         int state;
2194
2195         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2196             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2197             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2198                 timer_active    = 1;
2199                 timer_expires   = icsk->icsk_timeout;
2200         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2201                 timer_active    = 4;
2202                 timer_expires   = icsk->icsk_timeout;
2203         } else if (timer_pending(&sk->sk_timer)) {
2204                 timer_active    = 2;
2205                 timer_expires   = sk->sk_timer.expires;
2206         } else {
2207                 timer_active    = 0;
2208                 timer_expires = jiffies;
2209         }
2210
2211         state = sk_state_load(sk);
2212         if (state == TCP_LISTEN)
2213                 rx_queue = sk->sk_ack_backlog;
2214         else
2215                 /* Because we don't lock the socket,
2216                  * we might find a transient negative value.
2217                  */
2218                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2219
2220         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2221                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2222                 i, src, srcp, dest, destp, state,
2223                 tp->write_seq - tp->snd_una,
2224                 rx_queue,
2225                 timer_active,
2226                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2227                 icsk->icsk_retransmits,
2228                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2229                 icsk->icsk_probes_out,
2230                 sock_i_ino(sk),
2231                 atomic_read(&sk->sk_refcnt), sk,
2232                 jiffies_to_clock_t(icsk->icsk_rto),
2233                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2234                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2235                 tp->snd_cwnd,
2236                 state == TCP_LISTEN ?
2237                     fastopenq->max_qlen :
2238                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2239 }
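/* For illustration only (the field values below are made up): a listening
 * socket on 127.0.0.1:3302 would be rendered by get_tcp4_sock() roughly as
 *
 *   0: 0100007F:0CE6 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 ffff880012345678 100 0 0 10 0
 *
 * matching the header emitted by tcp4_seq_show() for /proc/net/tcp.
 */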
2240
2241 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2242                                struct seq_file *f, int i)
2243 {
2244         long delta = tw->tw_timer.expires - jiffies;
2245         __be32 dest, src;
2246         __u16 destp, srcp;
2247
2248         dest  = tw->tw_daddr;
2249         src   = tw->tw_rcv_saddr;
2250         destp = ntohs(tw->tw_dport);
2251         srcp  = ntohs(tw->tw_sport);
2252
2253         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2254                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2255                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2256                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2257                 atomic_read(&tw->tw_refcnt), tw);
2258 }
2259
2260 #define TMPSZ 150
2261
2262 static int tcp4_seq_show(struct seq_file *seq, void *v)
2263 {
2264         struct tcp_iter_state *st;
2265         struct sock *sk = v;
2266
2267         seq_setwidth(seq, TMPSZ - 1);
2268         if (v == SEQ_START_TOKEN) {
2269                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2270                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2271                            "inode");
2272                 goto out;
2273         }
2274         st = seq->private;
2275
2276         if (sk->sk_state == TCP_TIME_WAIT)
2277                 get_timewait4_sock(v, seq, st->num);
2278         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2279                 get_openreq4(v, seq, st->num);
2280         else
2281                 get_tcp4_sock(v, seq, st->num);
2282 out:
2283         seq_pad(seq, '\n');
2284         return 0;
2285 }
2286
2287 static const struct file_operations tcp_afinfo_seq_fops = {
2288         .owner   = THIS_MODULE,
2289         .open    = tcp_seq_open,
2290         .read    = seq_read,
2291         .llseek  = seq_lseek,
2292         .release = seq_release_net
2293 };
2294
2295 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2296         .name           = "tcp",
2297         .family         = AF_INET,
2298         .seq_fops       = &tcp_afinfo_seq_fops,
2299         .seq_ops        = {
2300                 .show           = tcp4_seq_show,
2301         },
2302 };
2303
2304 static int __net_init tcp4_proc_init_net(struct net *net)
2305 {
2306         return tcp_proc_register(net, &tcp4_seq_afinfo);
2307 }
2308
2309 static void __net_exit tcp4_proc_exit_net(struct net *net)
2310 {
2311         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2312 }
2313
2314 static struct pernet_operations tcp4_net_ops = {
2315         .init = tcp4_proc_init_net,
2316         .exit = tcp4_proc_exit_net,
2317 };
2318
2319 int __init tcp4_proc_init(void)
2320 {
2321         return register_pernet_subsys(&tcp4_net_ops);
2322 }
2323
2324 void tcp4_proc_exit(void)
2325 {
2326         unregister_pernet_subsys(&tcp4_net_ops);
2327 }
2328 #endif /* CONFIG_PROC_FS */
2329
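/* Protocol hooks that connect the generic socket layer to IPv4 TCP.
 * Registered for SOCK_STREAM/IPPROTO_TCP in af_inet.c and attached to new
 * sockets by inet_create().
 */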
2330 struct proto tcp_prot = {
2331         .name                   = "TCP",
2332         .owner                  = THIS_MODULE,
2333         .close                  = tcp_close,
2334         .connect                = tcp_v4_connect,
2335         .disconnect             = tcp_disconnect,
2336         .accept                 = inet_csk_accept,
2337         .ioctl                  = tcp_ioctl,
2338         .init                   = tcp_v4_init_sock,
2339         .destroy                = tcp_v4_destroy_sock,
2340         .shutdown               = tcp_shutdown,
2341         .setsockopt             = tcp_setsockopt,
2342         .getsockopt             = tcp_getsockopt,
2343         .recvmsg                = tcp_recvmsg,
2344         .sendmsg                = tcp_sendmsg,
2345         .sendpage               = tcp_sendpage,
2346         .backlog_rcv            = tcp_v4_do_rcv,
2347         .release_cb             = tcp_release_cb,
2348         .hash                   = inet_hash,
2349         .unhash                 = inet_unhash,
2350         .get_port               = inet_csk_get_port,
2351         .enter_memory_pressure  = tcp_enter_memory_pressure,
2352         .stream_memory_free     = tcp_stream_memory_free,
2353         .sockets_allocated      = &tcp_sockets_allocated,
2354         .orphan_count           = &tcp_orphan_count,
2355         .memory_allocated       = &tcp_memory_allocated,
2356         .memory_pressure        = &tcp_memory_pressure,
2357         .sysctl_mem             = sysctl_tcp_mem,
2358         .sysctl_wmem            = sysctl_tcp_wmem,
2359         .sysctl_rmem            = sysctl_tcp_rmem,
2360         .max_header             = MAX_TCP_HEADER,
2361         .obj_size               = sizeof(struct tcp_sock),
2362         .slab_flags             = SLAB_DESTROY_BY_RCU,
2363         .twsk_prot              = &tcp_timewait_sock_ops,
2364         .rsk_prot               = &tcp_request_sock_ops,
2365         .h.hashinfo             = &tcp_hashinfo,
2366         .no_autobind            = true,
2367 #ifdef CONFIG_COMPAT
2368         .compat_setsockopt      = compat_tcp_setsockopt,
2369         .compat_getsockopt      = compat_tcp_getsockopt,
2370 #endif
2371 #ifdef CONFIG_MEMCG_KMEM
2372         .init_cgroup            = tcp_init_cgroup,
2373         .destroy_cgroup         = tcp_destroy_cgroup,
2374         .proto_cgroup           = tcp_proto_cgroup,
2375 #endif
2376         .diag_destroy           = tcp_abort,
2377 };
2378 EXPORT_SYMBOL(tcp_prot);
2379
2380 static void __net_exit tcp_sk_exit(struct net *net)
2381 {
2382         int cpu;
2383
2384         for_each_possible_cpu(cpu)
2385                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2386         free_percpu(net->ipv4.tcp_sk);
2387 }
2388
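/* Per-netns setup: allocate one control socket per possible CPU (used by
 * tcp_v4_send_reset()/tcp_v4_send_ack() to transmit packets that have no
 * full socket of their own) and initialise the namespace's ECN and MTU
 * probing sysctl defaults.
 */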
2389 static int __net_init tcp_sk_init(struct net *net)
2390 {
2391         int res, cpu;
2392
2393         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2394         if (!net->ipv4.tcp_sk)
2395                 return -ENOMEM;
2396
2397         for_each_possible_cpu(cpu) {
2398                 struct sock *sk;
2399
2400                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2401                                            IPPROTO_TCP, net);
2402                 if (res)
2403                         goto fail;
2404                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2405         }
2406
2407         net->ipv4.sysctl_tcp_ecn = 2;
2408         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2409
2410         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2411         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2412         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2413
2414         return 0;
2415 fail:
2416         tcp_sk_exit(net);
2417
2418         return res;
2419 }
2420
2421 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2422 {
2423         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2424 }
2425
2426 static struct pernet_operations __net_initdata tcp_sk_ops = {
2427        .init       = tcp_sk_init,
2428        .exit       = tcp_sk_exit,
2429        .exit_batch = tcp_sk_exit_batch,
2430 };
2431
2432 void __init tcp_v4_init(void)
2433 {
2434         inet_hashinfo_init(&tcp_hashinfo);
2435         if (register_pernet_subsys(&tcp_sk_ops))
2436                 panic("Failed to create the TCP control socket.\n");
2437 }