net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      is been reached. This won't break
  33  *                                      old apps and it will avoid huge amount
  34  *                                      of socks hashed (this for unix_gc()
  35  *                                      performances reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119
 120 #include "scm.h"
 121
  122 static atomic_long_t unix_nr_socks;	/* live AF_UNIX sockets; decremented in unix_sock_destructor() */
  123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];	/* sockets chained via sk_bind_node, looked up by unix_bsd_hash() of an inode */
  124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];	/* one lock per bsd_socket_buckets[] entry, same index */
 125
 126 /* SMP locking strategy:
 127  *    hash table is protected with spinlock.
 128  *    each socket state is protected by separate spinlock.
 129  */
 130
 131 static unsigned int unix_unbound_hash(struct sock *sk)
 132 {
 133         unsigned long hash = (unsigned long)sk;
 134
 135         hash ^= hash >> 16;
 136         hash ^= hash >> 8;
 137         hash ^= sk->sk_type;
 138
 139         return hash & UNIX_HASH_MOD;
 140 }
 141
 142 static unsigned int unix_bsd_hash(struct inode *i)
 143 {
 144         return i->i_ino & UNIX_HASH_MOD;
 145 }
 146
 147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 148                                        int addr_len, int type)
 149 {
 150         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 151         unsigned int hash;
 152
 153         hash = (__force unsigned int)csum_fold(csum);
 154         hash ^= hash >> 8;
 155         hash ^= type;
 156
 157         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 158 }
 159
 160 static void unix_table_double_lock(struct net *net,
 161                                    unsigned int hash1, unsigned int hash2)
 162 {
 163         if (hash1 == hash2) {
 164                 spin_lock(&net->unx.table.locks[hash1]);
 165                 return;
 166         }
 167
 168         if (hash1 > hash2)
 169                 swap(hash1, hash2);
 170
 171         spin_lock(&net->unx.table.locks[hash1]);
 172         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 173 }
 174
 175 static void unix_table_double_unlock(struct net *net,
 176                                      unsigned int hash1, unsigned int hash2)
 177 {
 178         if (hash1 == hash2) {
 179                 spin_unlock(&net->unx.table.locks[hash1]);
 180                 return;
 181         }
 182
 183         spin_unlock(&net->unx.table.locks[hash1]);
 184         spin_unlock(&net->unx.table.locks[hash2]);
 185 }
 186
 187 #ifdef CONFIG_SECURITY_NETWORK
 188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 189 {
 190         UNIXCB(skb).secid = scm->secid;
 191 }
 192
 193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 194 {
 195         scm->secid = UNIXCB(skb).secid;
 196 }
 197
 198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 199 {
 200         return (scm->secid == UNIXCB(skb).secid);
 201 }
 202 #else
 203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 204 { }
 205
 206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 207 { }
 208
 209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 210 {
 211         return true;
 212 }
 213 #endif /* CONFIG_SECURITY_NETWORK */
 214
 215 #define unix_peer(sk) (unix_sk(sk)->peer)
 216
 217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 218 {
 219         return unix_peer(osk) == sk;
 220 }
 221
 222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 223 {
 224         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 225 }
 226
 227 static inline int unix_recvq_full(const struct sock *sk)
 228 {
 229         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 230 }
 231
 232 static inline int unix_recvq_full_lockless(const struct sock *sk)
 233 {
 234         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 235                 READ_ONCE(sk->sk_max_ack_backlog);
 236 }
 237
 238 struct sock *unix_peer_get(struct sock *s)
 239 {
 240         struct sock *peer;
 241
 242         unix_state_lock(s);
 243         peer = unix_peer(s);
 244         if (peer)
 245                 sock_hold(peer);
 246         unix_state_unlock(s);
 247         return peer;
 248 }
 249 EXPORT_SYMBOL_GPL(unix_peer_get);
 250
 251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 252                                              int addr_len)
 253 {
 254         struct unix_address *addr;
 255
 256         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 257         if (!addr)
 258                 return NULL;
 259
 260         refcount_set(&addr->refcnt, 1);
 261         addr->len = addr_len;
 262         memcpy(addr->name, sunaddr, addr_len);
 263
 264         return addr;
 265 }
 266
 267 static inline void unix_release_addr(struct unix_address *addr)
 268 {
 269         if (refcount_dec_and_test(&addr->refcnt))
 270                 kfree(addr);
 271 }
 272
 273 /*
 274  *      Check unix socket name:
 275  *              - should be not zero length.
 276  *              - if started by not zero, should be NULL terminated (FS object)
 277  *              - if started by zero, it is abstract name.
 278  */
 279
 280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 281 {
 282         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 283             addr_len > sizeof(*sunaddr))
 284                 return -EINVAL;
 285
 286         if (sunaddr->sun_family != AF_UNIX)
 287                 return -EINVAL;
 288
 289         return 0;
 290 }
 291
 292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 293 {
 294         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 295         short offset = offsetof(struct sockaddr_storage, __data);
 296
 297         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 298
 299         /* This may look like an off by one error but it is a bit more
 300          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 301          * sun_path[108] doesn't as such exist.  However in kernel space
 302          * we are guaranteed that it is a valid memory location in our
 303          * kernel address buffer because syscall functions always pass
 304          * a pointer of struct sockaddr_storage which has a bigger buffer
 305          * than 108.  Also, we must terminate sun_path for strlen() in
 306          * getname_kernel().
 307          */
 308         addr->__data[addr_len - offset] = 0;
 309
 310         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 311          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 312          * know the actual buffer.
 313          */
 314         return strlen(addr->__data) + offset + 1;
 315 }
 316
 317 static void __unix_remove_socket(struct sock *sk)
 318 {
 319         sk_del_node_init(sk);
 320 }
 321
 322 static void __unix_insert_socket(struct net *net, struct sock *sk)
 323 {
 324         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 325         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 326 }
 327
 328 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 329                                  struct unix_address *addr, unsigned int hash)
 330 {
 331         __unix_remove_socket(sk);
 332         smp_store_release(&unix_sk(sk)->addr, addr);
 333
 334         sk->sk_hash = hash;
 335         __unix_insert_socket(net, sk);
 336 }
 337
 338 static void unix_remove_socket(struct net *net, struct sock *sk)
 339 {
 340         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 341         __unix_remove_socket(sk);
 342         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 343 }
 344
 345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 346 {
 347         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 348         __unix_insert_socket(net, sk);
 349         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 350 }
 351
 352 static void unix_insert_bsd_socket(struct sock *sk)
 353 {
 354         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 355         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 356         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 357 }
 358
 359 static void unix_remove_bsd_socket(struct sock *sk)
 360 {
 361         if (!hlist_unhashed(&sk->sk_bind_node)) {
 362                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 363                 __sk_del_bind_node(sk);
 364                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 365
 366                 sk_node_init(&sk->sk_bind_node);
 367         }
 368 }
 369
 370 static struct sock *__unix_find_socket_byname(struct net *net,
 371                                               struct sockaddr_un *sunname,
 372                                               int len, unsigned int hash)
 373 {
 374         struct sock *s;
 375
 376         sk_for_each(s, &net->unx.table.buckets[hash]) {
 377                 struct unix_sock *u = unix_sk(s);
 378
 379                 if (u->addr->len == len &&
 380                     !memcmp(u->addr->name, sunname, len))
 381                         return s;
 382         }
 383         return NULL;
 384 }
 385
 386 static inline struct sock *unix_find_socket_byname(struct net *net,
 387                                                    struct sockaddr_un *sunname,
 388                                                    int len, unsigned int hash)
 389 {
 390         struct sock *s;
 391
 392         spin_lock(&net->unx.table.locks[hash]);
 393         s = __unix_find_socket_byname(net, sunname, len, hash);
 394         if (s)
 395                 sock_hold(s);
 396         spin_unlock(&net->unx.table.locks[hash]);
 397         return s;
 398 }
 399
 400 static struct sock *unix_find_socket_byinode(struct inode *i)
 401 {
 402         unsigned int hash = unix_bsd_hash(i);
 403         struct sock *s;
 404
 405         spin_lock(&bsd_socket_locks[hash]);
 406         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 407                 struct dentry *dentry = unix_sk(s)->path.dentry;
 408
 409                 if (dentry && d_backing_inode(dentry) == i) {
 410                         sock_hold(s);
 411                         spin_unlock(&bsd_socket_locks[hash]);
 412                         return s;
 413                 }
 414         }
 415         spin_unlock(&bsd_socket_locks[hash]);
 416         return NULL;
 417 }
 418
 419 /* Support code for asymmetrically connected dgram sockets
 420  *
 421  * If a datagram socket is connected to a socket not itself connected
 422  * to the first socket (eg, /dev/log), clients may only enqueue more
 423  * messages if the present receive queue of the server socket is not
 424  * "too large". This means there's a second writeability condition
 425  * poll and sendmsg need to test. The dgram recv code will do a wake
 426  * up on the peer_wait wait queue of a socket upon reception of a
 427  * datagram which needs to be propagated to sleeping would-be writers
 428  * since these might not have sent anything so far. This can't be
 429  * accomplished via poll_wait because the lifetime of the server
 430  * socket might be less than that of its clients if these break their
 431  * association with it or if the server socket is closed while clients
 432  * are still connected to it and there's no way to inform "a polling
 433  * implementation" that it should let go of a certain wait queue
 434  *
 435  * In order to propagate a wake up, a wait_queue_entry_t of the client
 436  * socket is enqueued on the peer_wait queue of the server socket
 437  * whose wake function does a wake_up on the ordinary client socket
 438  * wait queue. This connection is established whenever a write (or
 439  * poll for write) hit the flow control condition and broken when the
 440  * association to the server socket is dissolved or after a wake up
 441  * was relayed.
 442  */
 443
 444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 445                                       void *key)
 446 {
 447         struct unix_sock *u;
 448         wait_queue_head_t *u_sleep;
 449
 450         u = container_of(q, struct unix_sock, peer_wake);
 451
 452         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 453                             q);
 454         u->peer_wake.private = NULL;
 455
 456         /* relaying can only happen while the wq still exists */
 457         u_sleep = sk_sleep(&u->sk);
 458         if (u_sleep)
 459                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 460
 461         return 0;
 462 }
 463
 464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 465 {
 466         struct unix_sock *u, *u_other;
 467         int rc;
 468
 469         u = unix_sk(sk);
 470         u_other = unix_sk(other);
 471         rc = 0;
 472         spin_lock(&u_other->peer_wait.lock);
 473
 474         if (!u->peer_wake.private) {
 475                 u->peer_wake.private = other;
 476                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 477
 478                 rc = 1;
 479         }
 480
 481         spin_unlock(&u_other->peer_wait.lock);
 482         return rc;
 483 }
 484
 485 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 486                                             struct sock *other)
 487 {
 488         struct unix_sock *u, *u_other;
 489
 490         u = unix_sk(sk);
 491         u_other = unix_sk(other);
 492         spin_lock(&u_other->peer_wait.lock);
 493
 494         if (u->peer_wake.private == other) {
 495                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 496                 u->peer_wake.private = NULL;
 497         }
 498
 499         spin_unlock(&u_other->peer_wait.lock);
 500 }
 501
 502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 503                                                    struct sock *other)
 504 {
 505         unix_dgram_peer_wake_disconnect(sk, other);
 506         wake_up_interruptible_poll(sk_sleep(sk),
 507                                    EPOLLOUT |
 508                                    EPOLLWRNORM |
 509                                    EPOLLWRBAND);
 510 }
 511
 512 /* preconditions:
 513  *      - unix_peer(sk) == other
 514  *      - association is stable
 515  */
 516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 517 {
 518         int connected;
 519
 520         connected = unix_dgram_peer_wake_connect(sk, other);
 521
 522         /* If other is SOCK_DEAD, we want to make sure we signal
 523          * POLLOUT, such that a subsequent write() can get a
 524          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 525          * to other and its full, we will hang waiting for POLLOUT.
 526          */
 527         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 528                 return 1;
 529
 530         if (connected)
 531                 unix_dgram_peer_wake_disconnect(sk, other);
 532
 533         return 0;
 534 }
 535
 536 static int unix_writable(const struct sock *sk)
 537 {
 538         return sk->sk_state != TCP_LISTEN &&
 539                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 540 }
 541
 542 static void unix_write_space(struct sock *sk)
 543 {
 544         struct socket_wq *wq;
 545
 546         rcu_read_lock();
 547         if (unix_writable(sk)) {
 548                 wq = rcu_dereference(sk->sk_wq);
 549                 if (skwq_has_sleeper(wq))
 550                         wake_up_interruptible_sync_poll(&wq->wait,
 551                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 552                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 553         }
 554         rcu_read_unlock();
 555 }
 556
 557 /* When dgram socket disconnects (or changes its peer), we clear its receive
 558  * queue of packets arrived from previous peer. First, it allows to do
 559  * flow control based only on wmem_alloc; second, sk connected to peer
 560  * may receive messages only from that peer. */
 561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 562 {
 563         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 564                 skb_queue_purge(&sk->sk_receive_queue);
 565                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 566
 567                 /* If one link of bidirectional dgram pipe is disconnected,
 568                  * we signal error. Messages are lost. Do not make this,
 569                  * when peer was not connected to us.
 570                  */
 571                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 572                         WRITE_ONCE(other->sk_err, ECONNRESET);
 573                         sk_error_report(other);
 574                 }
 575         }
 576         other->sk_state = TCP_CLOSE;
 577 }
 578
 579 static void unix_sock_destructor(struct sock *sk)
 580 {
 581         struct unix_sock *u = unix_sk(sk);
 582
 583         skb_queue_purge(&sk->sk_receive_queue);
 584
 585         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 586         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 587         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 588         if (!sock_flag(sk, SOCK_DEAD)) {
 589                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 590                 return;
 591         }
 592
 593         if (u->addr)
 594                 unix_release_addr(u->addr);
 595
 596         atomic_long_dec(&unix_nr_socks);
 597         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 598 #ifdef UNIX_REFCNT_DEBUG
 599         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 600                 atomic_long_read(&unix_nr_socks));
 601 #endif
 602 }
 603
 604 static void unix_release_sock(struct sock *sk, int embrion)
 605 {
 606         struct unix_sock *u = unix_sk(sk);
 607         struct sock *skpair;
 608         struct sk_buff *skb;
 609         struct path path;
 610         int state;
 611
 612         unix_remove_socket(sock_net(sk), sk);
 613         unix_remove_bsd_socket(sk);
 614
 615         /* Clear state */
 616         unix_state_lock(sk);
 617         sock_orphan(sk);
 618         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 619         path         = u->path;
 620         u->path.dentry = NULL;
 621         u->path.mnt = NULL;
 622         state = sk->sk_state;
 623         sk->sk_state = TCP_CLOSE;
 624
 625         skpair = unix_peer(sk);
 626         unix_peer(sk) = NULL;
 627
 628         unix_state_unlock(sk);
 629
 630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 631         if (u->oob_skb) {
 632                 kfree_skb(u->oob_skb);
 633                 u->oob_skb = NULL;
 634         }
 635 #endif
 636
 637         wake_up_interruptible_all(&u->peer_wait);
 638
 639         if (skpair != NULL) {
 640                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 641                         unix_state_lock(skpair);
 642                         /* No more writes */
 643                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 644                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 645                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 646                         unix_state_unlock(skpair);
 647                         skpair->sk_state_change(skpair);
 648                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 649                 }
 650
 651                 unix_dgram_peer_wake_disconnect(sk, skpair);
 652                 sock_put(skpair); /* It may now die */
 653         }
 654
 655         /* Try to flush out this socket. Throw out buffers at least */
 656
 657         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 658                 if (state == TCP_LISTEN)
 659                         unix_release_sock(skb->sk, 1);
 660                 /* passed fds are erased in the kfree_skb hook        */
 661                 UNIXCB(skb).consumed = skb->len;
 662                 kfree_skb(skb);
 663         }
 664
 665         if (path.dentry)
 666                 path_put(&path);
 667
 668         sock_put(sk);
 669
 670         /* ---- Socket is dead now and most probably destroyed ---- */
 671
 672         /*
 673          * Fixme: BSD difference: In BSD all sockets connected to us get
 674          *        ECONNRESET and we die on the spot. In Linux we behave
 675          *        like files and pipes do and wait for the last
 676          *        dereference.
 677          *
 678          * Can't we simply set sock->err?
 679          *
 680          *        What the above comment does talk about? --ANK(980817)
 681          */
 682
 683         if (unix_tot_inflight)
 684                 unix_gc();              /* Garbage collect fds */
 685 }
 686
 687 static void init_peercred(struct sock *sk)
 688 {
 689         const struct cred *old_cred;
 690         struct pid *old_pid;
 691
 692         spin_lock(&sk->sk_peer_lock);
 693         old_pid = sk->sk_peer_pid;
 694         old_cred = sk->sk_peer_cred;
 695         sk->sk_peer_pid  = get_pid(task_tgid(current));
 696         sk->sk_peer_cred = get_current_cred();
 697         spin_unlock(&sk->sk_peer_lock);
 698
 699         put_pid(old_pid);
 700         put_cred(old_cred);
 701 }
 702
 703 static void copy_peercred(struct sock *sk, struct sock *peersk)
 704 {
 705         const struct cred *old_cred;
 706         struct pid *old_pid;
 707
 708         if (sk < peersk) {
 709                 spin_lock(&sk->sk_peer_lock);
 710                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 711         } else {
 712                 spin_lock(&peersk->sk_peer_lock);
 713                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 714         }
 715         old_pid = sk->sk_peer_pid;
 716         old_cred = sk->sk_peer_cred;
 717         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 718         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 719
 720         spin_unlock(&sk->sk_peer_lock);
 721         spin_unlock(&peersk->sk_peer_lock);
 722
 723         put_pid(old_pid);
 724         put_cred(old_cred);
 725 }
 726
 727 static int unix_listen(struct socket *sock, int backlog)
 728 {
 729         int err;
 730         struct sock *sk = sock->sk;
 731         struct unix_sock *u = unix_sk(sk);
 732
 733         err = -EOPNOTSUPP;
 734         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 735                 goto out;       /* Only stream/seqpacket sockets accept */
 736         err = -EINVAL;
 737         if (!u->addr)
 738                 goto out;       /* No listens on an unbound socket */
 739         unix_state_lock(sk);
 740         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 741                 goto out_unlock;
 742         if (backlog > sk->sk_max_ack_backlog)
 743                 wake_up_interruptible_all(&u->peer_wait);
 744         sk->sk_max_ack_backlog  = backlog;
 745         sk->sk_state            = TCP_LISTEN;
 746         /* set credentials so connect can copy them */
 747         init_peercred(sk);
 748         err = 0;
 749
 750 out_unlock:
 751         unix_state_unlock(sk);
 752 out:
 753         return err;
 754 }
 755
      /* Forward declarations of the socket operation handlers.  The
       * proto_ops tables (e.g. unix_stream_ops) refer to these by name;
       * the definitions are presumably further down in this file.
       */
  756 static int unix_release(struct socket *);
  757 static int unix_bind(struct socket *, struct sockaddr *, int);
  758 static int unix_stream_connect(struct socket *, struct sockaddr *,
  759                                int addr_len, int flags);
  760 static int unix_socketpair(struct socket *, struct socket *);
  761 static int unix_accept(struct socket *, struct socket *, int, bool);
  762 static int unix_getname(struct socket *, struct sockaddr *, int);
  763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
  764 static __poll_t unix_dgram_poll(struct file *, struct socket *,
  765                                     poll_table *);
  766 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
  767 #ifdef CONFIG_COMPAT
  768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
  769 #endif
  770 static int unix_shutdown(struct socket *, int);
  771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
  772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
  773 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
  774                                        struct pipe_inode_info *, size_t size,
  775                                        unsigned int flags);
  776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
  777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
  778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
  779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
  780 static int unix_dgram_connect(struct socket *, struct sockaddr *,
  781                               int, int);
  782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
  783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
  784                                   int);
 785
 786 static int unix_set_peek_off(struct sock *sk, int val)
 787 {
 788         struct unix_sock *u = unix_sk(sk);
 789
 790         if (mutex_lock_interruptible(&u->iolock))
 791                 return -EINTR;
 792
 793         WRITE_ONCE(sk->sk_peek_off, val);
 794         mutex_unlock(&u->iolock);
 795
 796         return 0;
 797 }
 798
 799 #ifdef CONFIG_PROC_FS
 800 static int unix_count_nr_fds(struct sock *sk)
 801 {
 802         struct sk_buff *skb;
 803         struct unix_sock *u;
 804         int nr_fds = 0;
 805
 806         spin_lock(&sk->sk_receive_queue.lock);
 807         skb = skb_peek(&sk->sk_receive_queue);
 808         while (skb) {
 809                 u = unix_sk(skb->sk);
 810                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 811                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 812         }
 813         spin_unlock(&sk->sk_receive_queue.lock);
 814
 815         return nr_fds;
 816 }
 817
 818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 819 {
 820         struct sock *sk = sock->sk;
 821         unsigned char s_state;
 822         struct unix_sock *u;
 823         int nr_fds = 0;
 824
 825         if (sk) {
 826                 s_state = READ_ONCE(sk->sk_state);
 827                 u = unix_sk(sk);
 828
 829                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 830                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 831                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 832                  */
 833                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 834                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 835                 else if (s_state == TCP_LISTEN)
 836                         nr_fds = unix_count_nr_fds(sk);
 837
 838                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 839         }
 840 }
 841 #else
 842 #define unix_show_fdinfo NULL
 843 #endif
 844
/* Socket operations installed by unix_create() for SOCK_STREAM sockets.
 * This table wires up real accept/listen handlers and is the only one of
 * the three unix proto_ops tables that provides .splice_read.
 */
static const struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .read_skb =     unix_stream_read_skb,
        .mmap =         sock_no_mmap,
        .splice_read =  unix_stream_splice_read,
        .set_peek_off = unix_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};
 869
/* Socket operations installed by unix_create() for SOCK_DGRAM sockets
 * (SOCK_RAW requests are converted to SOCK_DGRAM there and end up here too).
 * accept and listen are stubbed with sock_no_accept/sock_no_listen.
 */
static const struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_dgram_sendmsg,
        .read_skb =     unix_read_skb,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .set_peek_off = unix_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};
 893
/* Socket operations installed by unix_create() for SOCK_SEQPACKET sockets.
 * connect/accept/listen are the same handlers the stream table uses, poll
 * is the handler the dgram table uses, and send/recv have seqpacket-specific
 * entry points.  No .read_skb or .splice_read is provided.
 */
static const struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_seqpacket_recvmsg,
        .mmap =         sock_no_mmap,
        .set_peek_off = unix_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};
 916
 917 static void unix_close(struct sock *sk, long timeout)
 918 {
 919         /* Nothing to do here, unix socket does not need a ->close().
 920          * This is merely for sockmap.
 921          */
 922 }
 923
 924 static void unix_unhash(struct sock *sk)
 925 {
 926         /* Nothing to do here, unix socket does not need a ->unhash().
 927          * This is merely for sockmap.
 928          */
 929 }
 930
 931 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 932 {
 933         if (level == SOL_SOCKET) {
 934                 switch (optname) {
 935                 case SO_PEERPIDFD:
 936                         return true;
 937                 default:
 938                         return false;
 939                 }
 940         }
 941
 942         return false;
 943 }
 944
/* struct proto that unix_create1() passes to sk_alloc() for every socket
 * type other than SOCK_STREAM (i.e. dgram and seqpacket).
 */
struct proto unix_dgram_proto = {
        .name                   = "UNIX",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
        .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
#endif
};
 955
/* struct proto that unix_create1() passes to sk_alloc() for SOCK_STREAM
 * sockets.  Unlike unix_dgram_proto it also sets .unhash (to the empty
 * unix_unhash() stub).
 */
struct proto unix_stream_proto = {
        .name                   = "UNIX-STREAM",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
        .unhash                 = unix_unhash,
        .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = unix_stream_bpf_update_proto,
#endif
};
 967
 968 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 969 {
 970         struct unix_sock *u;
 971         struct sock *sk;
 972         int err;
 973
 974         atomic_long_inc(&unix_nr_socks);
 975         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 976                 err = -ENFILE;
 977                 goto err;
 978         }
 979
 980         if (type == SOCK_STREAM)
 981                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 982         else /*dgram and  seqpacket */
 983                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 984
 985         if (!sk) {
 986                 err = -ENOMEM;
 987                 goto err;
 988         }
 989
 990         sock_init_data(sock, sk);
 991
 992         sk->sk_hash             = unix_unbound_hash(sk);
 993         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 994         sk->sk_write_space      = unix_write_space;
 995         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 996         sk->sk_destruct         = unix_sock_destructor;
 997         u         = unix_sk(sk);
 998         u->path.dentry = NULL;
 999         u->path.mnt = NULL;
1000         spin_lock_init(&u->lock);
1001         atomic_long_set(&u->inflight, 0);
1002         INIT_LIST_HEAD(&u->link);
1003         mutex_init(&u->iolock); /* single task reading lock */
1004         mutex_init(&u->bindlock); /* single task binding lock */
1005         init_waitqueue_head(&u->peer_wait);
1006         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1007         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1008         unix_insert_unbound_socket(net, sk);
1009
1010         sock_prot_inuse_add(net, sk->sk_prot, 1);
1011
1012         return sk;
1013
1014 err:
1015         atomic_long_dec(&unix_nr_socks);
1016         return ERR_PTR(err);
1017 }
1018
1019 static int unix_create(struct net *net, struct socket *sock, int protocol,
1020                        int kern)
1021 {
1022         struct sock *sk;
1023
1024         if (protocol && protocol != PF_UNIX)
1025                 return -EPROTONOSUPPORT;
1026
1027         sock->state = SS_UNCONNECTED;
1028
1029         switch (sock->type) {
1030         case SOCK_STREAM:
1031                 sock->ops = &unix_stream_ops;
1032                 break;
1033                 /*
1034                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1035                  *      nothing uses it.
1036                  */
1037         case SOCK_RAW:
1038                 sock->type = SOCK_DGRAM;
1039                 fallthrough;
1040         case SOCK_DGRAM:
1041                 sock->ops = &unix_dgram_ops;
1042                 break;
1043         case SOCK_SEQPACKET:
1044                 sock->ops = &unix_seqpacket_ops;
1045                 break;
1046         default:
1047                 return -ESOCKTNOSUPPORT;
1048         }
1049
1050         sk = unix_create1(net, sock, kern, sock->type);
1051         if (IS_ERR(sk))
1052                 return PTR_ERR(sk);
1053
1054         return 0;
1055 }
1056
1057 static int unix_release(struct socket *sock)
1058 {
1059         struct sock *sk = sock->sk;
1060
1061         if (!sk)
1062                 return 0;
1063
1064         sk->sk_prot->close(sk, 0);
1065         unix_release_sock(sk, 0);
1066         sock->sk = NULL;
1067
1068         return 0;
1069 }
1070
/* Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the socket found via unix_find_socket_byinode() (dropped with
 * sock_put() on the error path here), or an ERR_PTR: the kern_path() or
 * path_permission() error, -ECONNREFUSED when the path is not a socket
 * inode or no socket is bound to it, or -EPROTOTYPE when the bound socket's
 * type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *inode;
        struct path path;
        struct sock *sk;
        int err;

        unix_mkname_bsd(sunaddr, addr_len);

        err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                return ERR_PTR(err);

        /* The caller must be allowed to write to the socket file. */
        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto drop_path;

        inode = d_backing_inode(path.dentry);
        if (!S_ISSOCK(inode->i_mode)) {
                err = -ECONNREFUSED;
                goto drop_path;
        }

        sk = unix_find_socket_byinode(inode);
        if (!sk) {
                err = -ECONNREFUSED;
                goto drop_path;
        }

        if (sk->sk_type != type) {
                err = -EPROTOTYPE;
                sock_put(sk);
                goto drop_path;
        }

        touch_atime(&path);
        path_put(&path);

        return sk;

drop_path:
        path_put(&path);
        return ERR_PTR(err);
}
1114
/* Look up the socket bound to the abstract name in @sunaddr within @net.
 *
 * Returns the socket, or ERR_PTR(-ECONNREFUSED) if no socket has that name.
 * If the socket found has a path dentry set, its atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct sock *sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);

        if (!sk)
                return ERR_PTR(-ECONNREFUSED);

        if (unix_sk(sk)->path.dentry)
                touch_atime(&unix_sk(sk)->path);

        return sk;
}
1133
1134 static struct sock *unix_find_other(struct net *net,
1135                                     struct sockaddr_un *sunaddr,
1136                                     int addr_len, int type)
1137 {
1138         struct sock *sk;
1139
1140         if (sunaddr->sun_path[0])
1141                 sk = unix_find_bsd(sunaddr, addr_len, type);
1142         else
1143                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1144
1145         return sk;
1146 }
1147
/* Bind @sk to an automatically chosen abstract name.
 *
 * The generated name is a leading zero byte followed by five hex digits.
 * Candidates are tried one after another, starting just past a random
 * 20-bit value, until one is found that no socket in @sk's netns uses.
 *
 * Returns 0 on success or when the socket already has an address, the
 * mutex_lock_interruptible() error if bindlock could not be taken, -ENOMEM
 * if the address cannot be allocated, or -ENOSPC when every candidate was
 * found to be in use.
 */
static int unix_autobind(struct sock *sk)
{
        unsigned int new_hash, old_hash = sk->sk_hash;
        struct unix_sock *u = unix_sk(sk);
        struct net *net = sock_net(sk);
        struct unix_address *addr;
        u32 lastnum, ordernum;
        int err;

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                return err;

        /* Already bound: nothing to do, err is still 0 here. */
        if (u->addr)
                goto out;

        err = -ENOMEM;
        /* Zeroed allocation, so sun_path[0] stays 0 (abstract name). */
        addr = kzalloc(sizeof(*addr) +
                       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
        if (!addr)
                goto out;

        /* 6 path bytes: the leading zero byte plus five hex digits. */
        addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
        addr->name->sun_family = AF_UNIX;
        refcount_set(&addr->refcnt, 1);

        /* Remember where the search started so a full wrap can be detected. */
        ordernum = get_random_u32();
        lastnum = ordernum & 0xFFFFF;
retry:
        ordernum = (ordernum + 1) & 0xFFFFF;
        sprintf(addr->name->sun_path + 1, "%05x", ordernum);

        new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
        unix_table_double_lock(net, old_hash, new_hash);

        if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
                unix_table_double_unlock(net, old_hash, new_hash);

                /* __unix_find_socket_byname() may take long time if many names
                 * are already in use.
                 */
                cond_resched();

                if (ordernum == lastnum) {
                        /* Give up if all names seems to be in use. */
                        err = -ENOSPC;
                        unix_release_addr(addr);
                        goto out;
                }

                goto retry;
        }

        /* Name is free: install it while both hash buckets are still locked. */
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        err = 0;

out:    mutex_unlock(&u->bindlock);
        return err;
}
1208
/* Bind @sk to the filesystem path in @sunaddr by creating a socket inode
 * there and recording the resulting path and address on the socket.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated, -EINVAL
 * if the socket already has an address, the mutex_lock_interruptible()
 * error if bindlock could not be taken, or the error from path creation /
 * the security hook / vfs_mknod(), with -EEXIST reported as -EADDRINUSE.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
                         int addr_len)
{
        /* Socket file mode: the socket inode's mode with the umask applied. */
        umode_t mode = S_IFSOCK |
               (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
        unsigned int new_hash, old_hash = sk->sk_hash;
        struct unix_sock *u = unix_sk(sk);
        struct net *net = sock_net(sk);
        struct mnt_idmap *idmap;
        struct unix_address *addr;
        struct dentry *dentry;
        struct path parent;
        int err;

        addr_len = unix_mkname_bsd(sunaddr, addr_len);
        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        /*
         * Get the parent directory, calculate the hash for last
         * component.
         */
        dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out;
        }

        /*
         * All right, let's create it.
         */
        idmap = mnt_idmap(parent.mnt);
        err = security_path_mknod(&parent, dentry, mode, 0);
        if (!err)
                err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
        if (err)
                goto out_path;
        /* From here on a failure must also unlink the inode just created. */
        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out_unlink;
        /* Already bound: the out_unlock path turns this into -EINVAL. */
        if (u->addr)
                goto out_unlock;

        new_hash = unix_bsd_hash(d_backing_inode(dentry));
        unix_table_double_lock(net, old_hash, new_hash);
        /* The socket keeps its own references on the mount and dentry. */
        u->path.mnt = mntget(parent.mnt);
        u->path.dentry = dget(dentry);
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        unix_insert_bsd_socket(sk);
        mutex_unlock(&u->bindlock);
        done_path_create(&parent, dentry);
        return 0;

out_unlock:
        mutex_unlock(&u->bindlock);
        err = -EINVAL;
out_unlink:
        /* failed after successful mknod?  unlink what we'd created... */
        vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
        done_path_create(&parent, dentry);
out:
        unix_release_addr(addr);
        /* An existing file at the path is reported as "address in use". */
        return err == -EEXIST ? -EADDRINUSE : err;
}
1276
1277 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1278                               int addr_len)
1279 {
1280         unsigned int new_hash, old_hash = sk->sk_hash;
1281         struct unix_sock *u = unix_sk(sk);
1282         struct net *net = sock_net(sk);
1283         struct unix_address *addr;
1284         int err;
1285
1286         addr = unix_create_addr(sunaddr, addr_len);
1287         if (!addr)
1288                 return -ENOMEM;
1289
1290         err = mutex_lock_interruptible(&u->bindlock);
1291         if (err)
1292                 goto out;
1293
1294         if (u->addr) {
1295                 err = -EINVAL;
1296                 goto out_mutex;
1297         }
1298
1299         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1300         unix_table_double_lock(net, old_hash, new_hash);
1301
1302         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1303                 goto out_spin;
1304
1305         __unix_set_addr_hash(net, sk, addr, new_hash);
1306         unix_table_double_unlock(net, old_hash, new_hash);
1307         mutex_unlock(&u->bindlock);
1308         return 0;
1309
1310 out_spin:
1311         unix_table_double_unlock(net, old_hash, new_hash);
1312         err = -EADDRINUSE;
1313 out_mutex:
1314         mutex_unlock(&u->bindlock);
1315 out:
1316         unix_release_addr(addr);
1317         return err;
1318 }
1319
1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1321 {
1322         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1323         struct sock *sk = sock->sk;
1324         int err;
1325
1326         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1327             sunaddr->sun_family == AF_UNIX)
1328                 return unix_autobind(sk);
1329
1330         err = unix_validate_addr(sunaddr, addr_len);
1331         if (err)
1332                 return err;
1333
1334         if (sunaddr->sun_path[0])
1335                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1336         else
1337                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1338
1339         return err;
1340 }
1341
/* Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or the same socket as @sk1, only @sk1 is locked.
 * Otherwise the socket at the lower address is locked first and the other
 * one is taken with the nested variant, so two callers passing the same
 * pair always lock in the same order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        struct sock *first, *second;

        if (unlikely(sk1 == sk2) || !sk2) {
                unix_state_lock(sk1);
                return;
        }

        first = sk1 < sk2 ? sk1 : sk2;
        second = sk1 < sk2 ? sk2 : sk1;

        unix_state_lock(first);
        unix_state_lock_nested(second);
}
1356
1357 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1358 {
1359         if (unlikely(sk1 == sk2) || !sk2) {
1360                 unix_state_unlock(sk1);
1361                 return;
1362         }
1363         unix_state_unlock(sk1);
1364         unix_state_unlock(sk2);
1365 }
1366
1367 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1368                               int alen, int flags)
1369 {
1370         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1371         struct sock *sk = sock->sk;
1372         struct sock *other;
1373         int err;
1374
1375         err = -EINVAL;
1376         if (alen < offsetofend(struct sockaddr, sa_family))
1377                 goto out;
1378
1379         if (addr->sa_family != AF_UNSPEC) {
1380                 err = unix_validate_addr(sunaddr, alen);
1381                 if (err)
1382                         goto out;
1383
1384                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1385                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1386                     !unix_sk(sk)->addr) {
1387                         err = unix_autobind(sk);
1388                         if (err)
1389                                 goto out;
1390                 }
1391
1392 restart:
1393                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1394                 if (IS_ERR(other)) {
1395                         err = PTR_ERR(other);
1396                         goto out;
1397                 }
1398
1399                 unix_state_double_lock(sk, other);
1400
1401                 /* Apparently VFS overslept socket death. Retry. */
1402                 if (sock_flag(other, SOCK_DEAD)) {
1403                         unix_state_double_unlock(sk, other);
1404                         sock_put(other);
1405                         goto restart;
1406                 }
1407
1408                 err = -EPERM;
1409                 if (!unix_may_send(sk, other))
1410                         goto out_unlock;
1411
1412                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1413                 if (err)
1414                         goto out_unlock;
1415
1416                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1417         } else {
1418                 /*
1419                  *      1003.1g breaking connected state with AF_UNSPEC
1420                  */
1421                 other = NULL;
1422                 unix_state_double_lock(sk, other);
1423         }
1424
1425         /*
1426          * If it was connected, reconnect.
1427          */
1428         if (unix_peer(sk)) {
1429                 struct sock *old_peer = unix_peer(sk);
1430
1431                 unix_peer(sk) = other;
1432                 if (!other)
1433                         sk->sk_state = TCP_CLOSE;
1434                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1435
1436                 unix_state_double_unlock(sk, other);
1437
1438                 if (other != old_peer)
1439                         unix_dgram_disconnected(sk, old_peer);
1440                 sock_put(old_peer);
1441         } else {
1442                 unix_peer(sk) = other;
1443                 unix_state_double_unlock(sk, other);
1444         }
1445
1446         return 0;
1447
1448 out_unlock:
1449         unix_state_double_unlock(sk, other);
1450         sock_put(other);
1451 out:
1452         return err;
1453 }
1454
/* Wait on @other's peer_wait queue for up to @timeo.
 *
 * Per the __releases() annotation the caller holds @other's state lock on
 * entry; it is dropped here and not retaken.  The task only sleeps if
 * @other is not dead, has not shut down receiving, and its receive queue is
 * still full.
 *
 * Returns @timeo unchanged when no sleep was needed, otherwise the value
 * returned by schedule_timeout().
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
        __releases(&unix_sk(other)->lock)
{
        struct unix_sock *u = unix_sk(other);
        int sched;
        DEFINE_WAIT(wait);

        /* Get on the wait queue before sampling the condition below. */
        prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

        sched = !sock_flag(other, SOCK_DEAD) &&
                !(other->sk_shutdown & RCV_SHUTDOWN) &&
                unix_recvq_full_lockless(other);

        /* Drop the state lock before possibly sleeping. */
        unix_state_unlock(other);

        if (sched)
                timeo = schedule_timeout(timeo);

        finish_wait(&u->peer_wait, &wait);
        return timeo;
}
1476
/* ->connect() handler used by the stream and seqpacket proto_ops tables.
 *
 * Allocates a new sock (@newsk) that becomes the peer of @sock, looks up the
 * listening socket named by @uaddr, links @sock and @newsk together and
 * queues an skb owned by @newsk on the listener's receive queue for
 * unix_accept() to pick up.
 *
 * Returns 0 on success.  Errors set here: -ENOMEM (skb allocation),
 * -ECONNREFUSED (target not listening or receive-shutdown), -EAGAIN (backlog
 * full and no timeout left), -EISCONN / -EINVAL (@sock not in TCP_CLOSE),
 * sock_intr_errno() when a signal interrupts the wait; otherwise the error
 * from unix_validate_addr(), unix_autobind(), unix_create1(),
 * unix_find_other() or security_unix_stream_connect().
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        long timeo;
        int err;
        int st;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        /* Credential passing is enabled but the socket has no address yet:
         * give it an autobound one first.
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
         * If we did it after the state is locked, we would have to
         * recheck everything again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                other = NULL;
                goto out;
        }

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* Drops other's state lock (see unix_wait_for_peer()). */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.
         *
         * It is a tricky place. We need to grab our state lock and cannot
         * drop the lock on the peer. It is dangerous because deadlock is
         * possible. The connect-to-self case and simultaneous connect
         * attempts are eliminated by checking the socket state: other is
         * TCP_LISTEN, and if sk is TCP_LISTEN we find that out before
         * attempting to grab the lock.
         *
         * Well, and we have to recheck the state after the socket is locked.
         *
         * sk's state lock is not held for this first read, hence READ_ONCE().
         */
        st = READ_ONCE(sk->sk_state);

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk);

        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        sock_hold(sk);
        unix_peer(newsk)        = sk;
        /* sk_state has lockless readers, so stores use WRITE_ONCE(). */
        WRITE_ONCE(newsk->sk_state, TCP_ESTABLISHED);
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under its lock.  Insertion into the
         * hash chain we'd found it in had been done in an
         * earlier critical area protected by the chain's lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Undo the up-front allocations and drop the lookup reference. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1670
/* ->socketpair() handler: connect the socks of @socka and @sockb to each
 * other.  Each sock takes a reference on the other, records it as its peer,
 * gets its peer credentials initialised, and both ends are marked
 * established/connected.  Always returns 0.
 *
 * Note that the local "skb" here is the struct sock of @sockb, not a
 * socket buffer.
 */
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
        struct sock *ska = socka->sk, *skb = sockb->sk;

        /* Join our sockets back to back */
        sock_hold(ska);
        sock_hold(skb);
        unix_peer(ska) = skb;
        unix_peer(skb) = ska;
        init_peercred(ska);
        init_peercred(skb);

        ska->sk_state = TCP_ESTABLISHED;
        skb->sk_state = TCP_ESTABLISHED;
        socka->state  = SS_CONNECTED;
        sockb->state  = SS_CONNECTED;
        return 0;
}
1689
1690 static void unix_sock_inherit_flags(const struct socket *old,
1691                                     struct socket *new)
1692 {
1693         if (test_bit(SOCK_PASSCRED, &old->flags))
1694                 set_bit(SOCK_PASSCRED, &new->flags);
1695         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696                 set_bit(SOCK_PASSPIDFD, &new->flags);
1697         if (test_bit(SOCK_PASSSEC, &old->flags))
1698                 set_bit(SOCK_PASSSEC, &new->flags);
1699 }
1700
1701 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1702                        bool kern)
1703 {
1704         struct sock *sk = sock->sk;
1705         struct sock *tsk;
1706         struct sk_buff *skb;
1707         int err;
1708
1709         err = -EOPNOTSUPP;
1710         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1711                 goto out;
1712
1713         err = -EINVAL;
1714         if (sk->sk_state != TCP_LISTEN)
1715                 goto out;
1716
1717         /* If socket state is TCP_LISTEN it cannot change (for now...),
1718          * so that no locks are necessary.
1719          */
1720
1721         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1722                                 &err);
1723         if (!skb) {
1724                 /* This means receive shutdown. */
1725                 if (err == 0)
1726                         err = -EINVAL;
1727                 goto out;
1728         }
1729
1730         tsk = skb->sk;
1731         skb_free_datagram(sk, skb);
1732         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1733
1734         /* attach accepted sock to socket */
1735         unix_state_lock(tsk);
1736         newsock->state = SS_CONNECTED;
1737         unix_sock_inherit_flags(sock, newsock);
1738         sock_graft(tsk, newsock);
1739         unix_state_unlock(tsk);
1740         return 0;
1741
1742 out:
1743         return err;
1744 }
1745
1746
1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1748 {
1749         struct sock *sk = sock->sk;
1750         struct unix_address *addr;
1751         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1752         int err = 0;
1753
1754         if (peer) {
1755                 sk = unix_peer_get(sk);
1756
1757                 err = -ENOTCONN;
1758                 if (!sk)
1759                         goto out;
1760                 err = 0;
1761         } else {
1762                 sock_hold(sk);
1763         }
1764
1765         addr = smp_load_acquire(&unix_sk(sk)->addr);
1766         if (!addr) {
1767                 sunaddr->sun_family = AF_UNIX;
1768                 sunaddr->sun_path[0] = 0;
1769                 err = offsetof(struct sockaddr_un, sun_path);
1770         } else {
1771                 err = addr->len;
1772                 memcpy(sunaddr, addr->name, addr->len);
1773         }
1774         sock_put(sk);
1775 out:
1776         return err;
1777 }
1778
/*
 * MSG_PEEK helper: give the receiver a duplicate (scm_fp_dup()) of the fd
 * list carried by @skb, leaving the skb's own list untouched, and then
 * synchronise with the garbage collector as explained below.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->fp = scm_fp_dup(UNIXCB(skb).fp);

        /*
         * Garbage collection of unix sockets starts by selecting a set of
         * candidate sockets which have reference only from being in flight
         * (total_refs == inflight_refs).  This condition is checked once during
         * the candidate collection phase, and candidates are marked as such, so
         * that non-candidates can later be ignored.  While inflight_refs is
         * protected by unix_gc_lock, total_refs (file count) is not, hence this
         * is an instantaneous decision.
         *
         * Once a candidate, however, the socket must not be reinstalled into a
         * file descriptor while the garbage collection is in progress.
         *
         * If the above conditions are met, then the directed graph of
         * candidates (*) does not change while unix_gc_lock is held.
         *
         * Any operation that changes the file count through file descriptors
         * (dup, close, sendmsg) does not change the graph since candidates are
         * not installed in fds.
         *
         * Dequeuing a candidate via recvmsg would install it into an fd, but
         * that takes unix_gc_lock to decrement the inflight count, so it's
         * serialized with garbage collection.
         *
         * MSG_PEEK is special in that it does not change the inflight count,
         * yet does install the socket into an fd.  The following lock/unlock
         * pair is to ensure serialization with garbage collection.  It must be
         * done between incrementing the file count and installing the file into
         * an fd.
         *
         * If garbage collection starts after the barrier provided by the
         * lock/unlock, then it will see the elevated refcount and not mark this
         * as a candidate.  If a garbage collection is already in progress
         * before the file count was incremented, then the lock/unlock pair will
         * ensure that garbage collection is finished before progressing to
         * installing the fd.
         *
         * (*) A -> B where B is on the queue of A or B is on the queue of C
         * which is on the queue of listening socket A.
         */
        spin_lock(&unix_gc_lock);
        spin_unlock(&unix_gc_lock);
}
1825
1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1827 {
1828         int err = 0;
1829
1830         UNIXCB(skb).pid  = get_pid(scm->pid);
1831         UNIXCB(skb).uid = scm->creds.uid;
1832         UNIXCB(skb).gid = scm->creds.gid;
1833         UNIXCB(skb).fp = NULL;
1834         unix_get_secdata(scm, skb);
1835         if (scm->fp && send_fds)
1836                 err = unix_attach_fds(scm, skb);
1837
1838         skb->destructor = unix_destruct_scm;
1839         return err;
1840 }
1841
1842 static bool unix_passcred_enabled(const struct socket *sock,
1843                                   const struct sock *other)
1844 {
1845         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847                !other->sk_socket ||
1848                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1850 }
1851
1852 /*
1853  * Some apps rely on write() giving SCM_CREDENTIALS
1854  * We include credentials if source or destination socket
1855  * asserted SOCK_PASSCRED.
1856  */
1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858                             const struct sock *other)
1859 {
1860         if (UNIXCB(skb).pid)
1861                 return;
1862         if (unix_passcred_enabled(sock, other)) {
1863                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1864                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1865         }
1866 }
1867
1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869                             struct scm_cookie *scm)
1870 {
1871         return UNIXCB(skb).pid == scm->pid &&
1872                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874                unix_secdata_eq(scm, skb);
1875 }
1876
1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1878 {
1879         struct scm_fp_list *fp = UNIXCB(skb).fp;
1880         struct unix_sock *u = unix_sk(sk);
1881
1882         if (unlikely(fp && fp->count))
1883                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1884 }
1885
1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1887 {
1888         struct scm_fp_list *fp = UNIXCB(skb).fp;
1889         struct unix_sock *u = unix_sk(sk);
1890
1891         if (unlikely(fp && fp->count))
1892                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1893 }
1894
/*
 *      Send AF_UNIX data.
 *
 *      Sends one datagram of @len bytes taken from @msg, either to the
 *      address in msg->msg_name or, when no address is supplied, to the
 *      connected peer.  unix_seqpacket_sendmsg() also ends up here.
 *
 *      Returns @len on success (also when the receiver's socket filter
 *      rejected the packet) or a negative errno.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
                              size_t len)
{
        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
        struct sock *sk = sock->sk, *other = NULL;
        struct unix_sock *u = unix_sk(sk);
        struct scm_cookie scm;
        struct sk_buff *skb;
        int data_len = 0;
        int sk_locked;
        long timeo;
        int err;

        wait_for_unix_gc();
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* From here on every exit must go through scm_destroy(&scm). */
        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out;

        if (msg->msg_namelen) {
                err = unix_validate_addr(sunaddr, msg->msg_namelen);
                if (err)
                        goto out;
        } else {
                /* No destination given: use the connected peer.  The
                 * reference obtained here is dropped by sock_put() on exit.
                 */
                sunaddr = NULL;
                err = -ENOTCONN;
                other = unix_peer_get(sk);
                if (!other)
                        goto out;
        }

        /* A sender that enabled credential passing but has no address yet
         * gets one assigned by unix_autobind().
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;

        /* Anything beyond SKB_MAX_ALLOC goes into paged data: page aligned
         * and capped at MAX_SKB_FRAGS pages.
         */
        if (len > SKB_MAX_ALLOC) {
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);
                data_len = PAGE_ALIGN(data_len);

                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
        }

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                   PAGE_ALLOC_COSTLY_ORDER);
        if (skb == NULL)
                goto out;

        err = unix_scm_to_skb(&scm, skb, true);
        if (err < 0)
                goto out_free;

        skb_put(skb, len - data_len);
        skb->data_len = data_len;
        skb->len = len;
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
        if (err)
                goto out_free;

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
        /* Entered with no lock held.  @other is NULL on the first pass of
         * an addressed send and after a dead destination was dropped below.
         */
        if (!other) {
                err = -ECONNRESET;
                if (sunaddr == NULL)
                        goto out_free;

                other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
                                        sk->sk_type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        other = NULL;
                        goto out_free;
                }
        }

        if (sk_filter(other, skb) < 0) {
                /* Toss the packet but do not return any error to the sender */
                err = len;
                goto out_free;
        }

        /* sk_locked tracks whether sk's state lock is held in addition to
         * other's; out_unlock relies on it.
         */
        sk_locked = 0;
        unix_state_lock(other);
restart_locked:
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;

        if (unlikely(sock_flag(other, SOCK_DEAD))) {
                /*
                 *      Check with 1003.1g - what should
                 *      datagram error
                 */
                unix_state_unlock(other);
                sock_put(other);

                if (!sk_locked)
                        unix_state_lock(sk);

                err = 0;
                if (sk->sk_type == SOCK_SEQPACKET) {
                        /* We are here only when racing with unix_release_sock()
                         * is clearing @other. Never change state to TCP_CLOSE
                         * unlike SOCK_DGRAM wants.
                         */
                        unix_state_unlock(sk);
                        err = -EPIPE;
                } else if (unix_peer(sk) == other) {
                        /* The dead sock was our connected peer: disconnect
                         * from it and drop the peer reference as well.
                         */
                        unix_peer(sk) = NULL;
                        unix_dgram_peer_wake_disconnect_wakeup(sk, other);

                        sk->sk_state = TCP_CLOSE;
                        unix_state_unlock(sk);

                        unix_dgram_disconnected(sk, other);
                        sock_put(other);
                        err = -ECONNREFUSED;
                } else {
                        unix_state_unlock(sk);
                }

                /* err == 0: the address may now resolve to another socket,
                 * so look it up again.
                 */
                other = NULL;
                if (err)
                        goto out_free;
                goto restart;
        }

        err = -EPIPE;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (sk->sk_type != SOCK_SEQPACKET) {
                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;
        }

        /* other == sk && unix_peer(other) != sk if
         * - unix_peer(sk) == NULL, destination address bound to sk
         * - unix_peer(sk) == sk by time of get but disconnected before lock
         */
        if (other != sk &&
            unlikely(unix_peer(other) != sk &&
            unix_recvq_full_lockless(other))) {
                if (timeo) {
                        /* Blocking send: wait on the peer and retry.
                         * NOTE(review): neither exit below unlocks @other,
                         * so unix_wait_for_peer() is expected to return
                         * with that lock released - confirm there.
                         */
                        timeo = unix_wait_for_peer(other, timeo);

                        err = sock_intr_errno(timeo);
                        if (signal_pending(current))
                                goto out_free;

                        goto restart;
                }

                /* Non-blocking send: both state locks are needed for the
                 * checks below, so re-acquire them as a pair.
                 */
                if (!sk_locked) {
                        unix_state_unlock(other);
                        unix_state_double_lock(sk, other);
                }

                if (unix_peer(sk) != other ||
                    unix_dgram_peer_wake_me(sk, other)) {
                        err = -EAGAIN;
                        sk_locked = 1;
                        goto out_unlock;
                }

                /* @other's lock was dropped above, so its state has to be
                 * validated again, this time with both locks held.
                 */
                if (!sk_locked) {
                        sk_locked = 1;
                        goto restart_locked;
                }
        }

        if (unlikely(sk_locked))
                unix_state_unlock(sk);

        /* Deliver: queue on the receiver and notify it. */
        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        scm_destroy(&scm);
        return len;

out_unlock:
        if (sk_locked)
                unix_state_unlock(sk);
        unix_state_unlock(other);
out_free:
        kfree_skb(skb);
out:
        if (other)
                sock_put(other);
        scm_destroy(&scm);
        return err;
}
2110
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * The value is PAGE_SIZE shifted by the page order that get_order()
 * reports for 32768 bytes; unix_stream_sendmsg() uses it both to bound
 * the size of one skb and as the allocation order hint.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue the out-of-band byte of a MSG_OOB stream send on @other.
 *
 * Allocates a one-byte skb, fills it from @msg, records it as the
 * receiver's current OOB skb (replacing any previous one) and appends it
 * to the receive queue.  Fds from @scm are attached only if @fds_sent is
 * false, i.e. they did not already travel with the in-band data.
 *
 * Returns 0 on success, -EPIPE if the receiver is dead or has shut down
 * reception, or the error from allocation / scm attach / data copy.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
                     struct scm_cookie *scm, bool fds_sent)
{
        struct unix_sock *ousk = unix_sk(other);
        struct sk_buff *skb;
        int err = 0;

        skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

        if (!skb)
                return err;

        err = unix_scm_to_skb(scm, skb, !fds_sent);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }
        skb_put(skb, 1);
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

        if (err) {
                kfree_skb(skb);
                return err;
        }

        unix_state_lock(other);

        if (sock_flag(other, SOCK_DEAD) ||
            (other->sk_shutdown & RCV_SHUTDOWN)) {
                unix_state_unlock(other);
                kfree_skb(skb);
                return -EPIPE;
        }

        maybe_add_creds(skb, sock, other);
        /* Extra reference: the skb is about to be referenced both from
         * ousk->oob_skb and from the receive queue.
         */
        skb_get(skb);

        /* Release the reference held through the OOB pointer being replaced. */
        if (ousk->oob_skb)
                consume_skb(ousk->oob_skb);

        WRITE_ONCE(ousk->oob_skb, skb);

        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        sk_send_sigurg(other);
        unix_state_unlock(other);
        other->sk_data_ready(other);

        /* err is 0 here: the copy above succeeded. */
        return err;
}
2167 #endif
2168
/*
 * sendmsg() for SOCK_STREAM unix sockets.
 *
 * Splits @len bytes from @msg into one or more skbs and appends each to
 * the connected peer's receive queue.  Passed fds travel with the first
 * skb only.  With MSG_OOB (and CONFIG_AF_UNIX_OOB) the final byte is sent
 * separately through queue_oob().
 *
 * Returns the number of bytes queued if any were, otherwise a negative
 * errno.  SIGPIPE is raised on a broken pipe only when nothing was sent
 * and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        struct sock *other = NULL;
        int err, size;
        struct sk_buff *skb;
        int sent = 0;
        struct scm_cookie scm;
        bool fds_sent = false;
        int data_len;

        wait_for_unix_gc();
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* With OOB support, hold back the last byte for queue_oob(); an
         * empty MSG_OOB send, or any MSG_OOB send without OOB support,
         * fails with -EOPNOTSUPP.
         */
        err = -EOPNOTSUPP;
        if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                if (len)
                        len--;
                else
#endif
                        goto out_err;
        }

        /* A destination address is never accepted on a stream socket. */
        if (msg->msg_namelen) {
                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
                goto out_err;
        } else {
                err = -ENOTCONN;
                other = unix_peer(sk);
                if (!other)
                        goto out_err;
        }

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                goto pipe_err;

        while (sent < len) {
                size = len - sent;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        /* Empty skb: the pages are spliced in further down. */
                        skb = sock_alloc_send_pskb(sk, 0, 0,
                                                   msg->msg_flags & MSG_DONTWAIT,
                                                   &err, 0);
                } else {
                        /* Keep two messages in the pipe so it schedules better */
                        size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

                        /* allow fallback to order-0 allocations */
                        size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

                        /* Whatever does not fit the linear head goes into
                         * paged data.
                         */
                        data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

                        data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

                        skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
                                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                                   get_order(UNIX_SKB_FRAGS_SZ));
                }
                if (!skb)
                        goto out_err;

                /* Only send the fds in the first buffer */
                err = unix_scm_to_skb(&scm, skb, !fds_sent);
                if (err < 0) {
                        kfree_skb(skb);
                        goto out_err;
                }
                fds_sent = true;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        err = skb_splice_from_iter(skb, &msg->msg_iter, size,
                                                   sk->sk_allocation);
                        if (err < 0) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                        /* The splice may take fewer bytes than requested. */
                        size = err;
                        refcount_add(size, &sk->sk_wmem_alloc);
                } else {
                        skb_put(skb, size - data_len);
                        skb->data_len = data_len;
                        skb->len = size;
                        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
                        if (err) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                }

                unix_state_lock(other);

                if (sock_flag(other, SOCK_DEAD) ||
                    (other->sk_shutdown & RCV_SHUTDOWN))
                        goto pipe_err_free;

                maybe_add_creds(skb, sock, other);
                scm_stat_add(other, skb);
                skb_queue_tail(&other->sk_receive_queue, skb);
                unix_state_unlock(other);
                other->sk_data_ready(other);
                sent += size;
        }

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        if (msg->msg_flags & MSG_OOB) {
                /* Send the byte held back at the top as out-of-band data. */
                err = queue_oob(sock, msg, other, &scm, fds_sent);
                if (err)
                        goto out_err;
                sent++;
        }
#endif

        scm_destroy(&scm);

        return sent;

pipe_err_free:
        unix_state_unlock(other);
        kfree_skb(skb);
pipe_err:
        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        err = -EPIPE;
out_err:
        scm_destroy(&scm);
        /* Partial success wins over the error. */
        return sent ? : err;
}
2300
2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2302                                   size_t len)
2303 {
2304         int err;
2305         struct sock *sk = sock->sk;
2306
2307         err = sock_error(sk);
2308         if (err)
2309                 return err;
2310
2311         if (sk->sk_state != TCP_ESTABLISHED)
2312                 return -ENOTCONN;
2313
2314         if (msg->msg_namelen)
2315                 msg->msg_namelen = 0;
2316
2317         return unix_dgram_sendmsg(sock, msg, len);
2318 }
2319
2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2321                                   size_t size, int flags)
2322 {
2323         struct sock *sk = sock->sk;
2324
2325         if (sk->sk_state != TCP_ESTABLISHED)
2326                 return -ENOTCONN;
2327
2328         return unix_dgram_recvmsg(sock, msg, size, flags);
2329 }
2330
2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2332 {
2333         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2334
2335         if (addr) {
2336                 msg->msg_namelen = addr->len;
2337                 memcpy(msg->msg_name, addr->name, addr->len);
2338         }
2339 }
2340
/*
 * Receive one datagram from @sk into @msg.
 *
 * Waits (subject to the receive timeout / MSG_DONTWAIT) for an skb,
 * copies at most @size bytes of it, fills in the sender address when
 * msg->msg_name is set, and hands credentials and passed fds to the
 * caller.  With MSG_PEEK the skb stays queued and its fds are duplicated
 * rather than detached.
 *
 * Returns the number of bytes copied, or with MSG_TRUNC in @flags the
 * remaining length of the datagram, or a negative errno.  MSG_OOB is not
 * supported.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                         int flags)
{
        struct scm_cookie scm;
        struct socket *sock = sk->sk_socket;
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb, *last;
        long timeo;
        int skip;
        int err;

        err = -EOPNOTSUPP;
        if (flags&MSG_OOB)
                goto out;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* The loop is left with u->iolock held if and only if an skb was
         * obtained.
         */
        do {
                mutex_lock(&u->iolock);

                skip = sk_peek_offset(sk, flags);
                skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
                                              &skip, &err, &last);
                if (skb) {
                        if (!(flags & MSG_PEEK))
                                scm_stat_del(sk, skb);
                        break;
                }

                mutex_unlock(&u->iolock);

                if (err != -EAGAIN)
                        break;
        } while (timeo &&
                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
                                              &err, &timeo, last));

        if (!skb) { /* implies iolock unlocked */
                unix_state_lock(sk);
                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        err = 0;
                unix_state_unlock(sk);
                goto out;
        }

        /* Wake tasks sleeping on peer_wait for write space. */
        if (wq_has_sleeper(&u->peer_wait))
                wake_up_interruptible_sync_poll(&u->peer_wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        if (msg->msg_name)
                unix_copy_addr(msg, skb->sk);

        /* Clamp to what is left of the datagram past the peek offset and
         * flag truncation if the caller's buffer is smaller.
         */
        if (size > skb->len - skip)
                size = skb->len - skip;
        else if (size < skb->len - skip)
                msg->msg_flags |= MSG_TRUNC;

        err = skb_copy_datagram_msg(skb, skip, msg, size);
        if (err)
                goto out_free;

        if (sock_flag(sk, SOCK_RCVTSTAMP))
                __sock_recv_timestamp(msg, sk, skb);

        memset(&scm, 0, sizeof(scm));

        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
        unix_set_secdata(&scm, skb);

        if (!(flags & MSG_PEEK)) {
                if (UNIXCB(skb).fp)
                        unix_detach_fds(&scm, skb);

                sk_peek_offset_bwd(sk, skb->len);
        } else {
                /* It is questionable: on PEEK we could:
                   - do not return fds - good, but too simple 8)
                   - return fds, and do not return them on read (old strategy,
                     apparently wrong)
                   - clone fds (I chose it for now, it is the most universal
                     solution)

                   POSIX 1003.1g does not actually define this clearly
                   at all. POSIX 1003.1g doesn't define a lot of things
                   clearly however!

                */

                sk_peek_offset_fwd(sk, size);

                if (UNIXCB(skb).fp)
                        unix_peek_fds(&scm, skb);
        }
        /* With MSG_TRUNC in @flags report the real remaining length. */
        err = (flags & MSG_TRUNC) ? skb->len - skip : size;

        scm_recv_unix(sock, msg, &scm, flags);

out_free:
        skb_free_datagram(sk, skb);
        mutex_unlock(&u->iolock);
out:
        return err;
}
2447
2448 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2449                               int flags)
2450 {
2451         struct sock *sk = sock->sk;
2452
2453 #ifdef CONFIG_BPF_SYSCALL
2454         const struct proto *prot = READ_ONCE(sk->sk_prot);
2455
2456         if (prot != &unix_dgram_proto)
2457                 return prot->recvmsg(sk, msg, size, flags, NULL);
2458 #endif
2459         return __unix_dgram_recvmsg(sk, msg, size, flags);
2460 }
2461
2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2463 {
2464         struct unix_sock *u = unix_sk(sk);
2465         struct sk_buff *skb;
2466         int err;
2467
2468         mutex_lock(&u->iolock);
2469         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2470         mutex_unlock(&u->iolock);
2471         if (!skb)
2472                 return err;
2473
2474         return recv_actor(sk, skb);
2475 }
2476
/*
 *      Sleep until more data has arrived. But check for races..
 *
 *      @last and @last_len describe the tail of the receive queue as the
 *      caller last saw it; the wait ends as soon as the tail differs, an
 *      error or receive shutdown is pending, a signal arrives, the timeout
 *      runs out, or the socket is found dead after a sleep.
 *
 *      Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
                                  struct sk_buff *last, unsigned int last_len,
                                  bool freezable)
{
        /* TASK_FREEZABLE is included only when @freezable is true. */
        unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
        struct sk_buff *tail;
        DEFINE_WAIT(wait);

        unix_state_lock(sk);

        for (;;) {
                /* Register on the wait queue before testing the conditions. */
                prepare_to_wait(sk_sleep(sk), &wait, state);

                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (tail != last ||
                    (tail && tail->len != last_len) ||
                    sk->sk_err ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current) ||
                    !timeo)
                        break;

                sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
                /* The state lock is not held across the sleep. */
                unix_state_unlock(sk);
                timeo = schedule_timeout(timeo);
                unix_state_lock(sk);

                /* Note: leaves SOCKWQ_ASYNC_WAITDATA set on a dead socket. */
                if (sock_flag(sk, SOCK_DEAD))
                        break;

                sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        }

        finish_wait(sk_sleep(sk), &wait);
        unix_state_unlock(sk);
        return timeo;
}
2517
2518 static unsigned int unix_skb_len(const struct sk_buff *skb)
2519 {
2520         return skb->len - UNIXCB(skb).consumed;
2521 }
2522
/*
 * Parameters of one stream read, passed by pointer to the stream receive
 * helpers and back to the recv_actor callback.
 */
struct unix_stream_read_state {
        /* Callback that delivers data from an skb; unix_stream_recv_urg()
         * calls it as (skb, 0, chunk, state) and treats a negative return
         * as failure.  NOTE(review): the two ints look like offset and
         * length - confirm against the actor implementations.
         */
        int (*recv_actor)(struct sk_buff *, int, int,
                          struct unix_stream_read_state *);
        /* Socket being read from. */
        struct socket *socket;
        /* Message header; msg_flags receives MSG_OOB on an OOB read. */
        struct msghdr *msg;
        /* Presumably the destination pipe for splice reads - not used in
         * the code visible here, verify against the splice path.
         */
        struct pipe_inode_info *pipe;
        /* Number of bytes requested. */
        size_t size;
        /* MSG_* receive flags (MSG_PEEK, MSG_OOB, MSG_DONTWAIT, ...). */
        int flags;
        /* Presumably the splice() flags for splice reads - TODO confirm. */
        unsigned int splice_flags;
};
2533
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2535 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2536 {
2537         struct socket *sock = state->socket;
2538         struct sock *sk = sock->sk;
2539         struct unix_sock *u = unix_sk(sk);
2540         int chunk = 1;
2541         struct sk_buff *oob_skb;
2542
2543         mutex_lock(&u->iolock);
2544         unix_state_lock(sk);
2545
2546         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2547                 unix_state_unlock(sk);
2548                 mutex_unlock(&u->iolock);
2549                 return -EINVAL;
2550         }
2551
2552         oob_skb = u->oob_skb;
2553
2554         if (!(state->flags & MSG_PEEK))
2555                 WRITE_ONCE(u->oob_skb, NULL);
2556
2557         unix_state_unlock(sk);
2558
2559         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2560
2561         if (!(state->flags & MSG_PEEK)) {
2562                 UNIXCB(oob_skb).consumed += 1;
2563                 kfree_skb(oob_skb);
2564         }
2565
2566         mutex_unlock(&u->iolock);
2567
2568         if (chunk < 0)
2569                 return -EFAULT;
2570
2571         state->msg->msg_flags |= MSG_OOB;
2572         return 1;
2573 }
2574
/*
 * Decide what an in-band stream read should do with @skb with respect to
 * out-of-band data.  @copied is the number of bytes the read has already
 * copied.
 *
 * Returns the skb to continue reading from (possibly a different one), or
 * NULL.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
                                  int flags, int copied)
{
        struct unix_sock *u = unix_sk(sk);

        if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
                /* Nothing left in this skb: remove it from the queue. */
                skb_unlink(skb, &sk->sk_receive_queue);
                consume_skb(skb);
                skb = NULL;
        } else {
                if (skb == u->oob_skb) {
                        if (copied) {
                                /* In-band data was already copied: do not
                                 * continue into the OOB skb.
                                 */
                                skb = NULL;
                        } else if (sock_flag(sk, SOCK_URGINLINE)) {
                                /* OOB data is read inline: unless peeking,
                                 * clear the OOB pointer and drop its
                                 * reference; the skb itself is returned.
                                 */
                                if (!(flags & MSG_PEEK)) {
                                        WRITE_ONCE(u->oob_skb, NULL);
                                        consume_skb(skb);
                                }
                        } else if (!(flags & MSG_PEEK)) {
                                /* Not inline: remove the OOB skb from the
                                 * queue and go on with the new queue head.
                                 */
                                skb_unlink(skb, &sk->sk_receive_queue);
                                consume_skb(skb);
                                skb = skb_peek(&sk->sk_receive_queue);
                        }
                }
        }
        return skb;
}
2602 #endif
2603
2604 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2605 {
2606         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2607                 return -ENOTCONN;
2608
2609         return unix_read_skb(sk, recv_actor);
2610 }
2611
/*
 * unix_stream_read_generic - common receive loop for stream sockets
 * @state:     request description: socket, optional msghdr, byte budget
 *             (->size), MSG_* flags and the actor that consumes each chunk
 * @freezable: passed through unchanged to unix_stream_data_wait()
 *
 * Walks sk->sk_receive_queue while holding u->iolock and feeds pieces of the
 * queued skbs to state->recv_actor.  The unix state lock is held only while
 * the queue is inspected; it is dropped before the actor runs.  The iolock
 * is released while sleeping for more data.
 *
 * The loop ends when the byte budget is used up, when the queue is empty and
 * the low-water target is met, on socket error / RCV_SHUTDOWN / timeout /
 * signal, when credentials of the next skb differ from the first one, or
 * when an skb carrying file descriptors has been handled.
 *
 * Return: the value of `copied` if non-zero (a byte count, or -EFAULT when
 * the actor failed before anything was copied), otherwise `err`, which may
 * be 0.
 */
2612 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2613                                     bool freezable)
2614 {
2615         struct scm_cookie scm;
2616         struct socket *sock = state->socket;
2617         struct sock *sk = sock->sk;
2618         struct unix_sock *u = unix_sk(sk);
2619         int copied = 0;
2620         int flags = state->flags;
2621         int noblock = flags & MSG_DONTWAIT;
2622         bool check_creds = false;
2623         int target;
2624         int err = 0;
2625         long timeo;
2626         int skip;
2627         size_t size = state->size;
2628         unsigned int last_len;
2629
2630         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2631                 err = -EINVAL;
2632                 goto out;
2633         }
2634
             /*
              * MSG_OOB is served by a dedicated helper; without
              * CONFIG_AF_UNIX_OOB the request fails with -EOPNOTSUPP.
              */
2635         if (unlikely(flags & MSG_OOB)) {
2636                 err = -EOPNOTSUPP;
2637 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2638                 err = unix_stream_recv_urg(state);
2639 #endif
2640                 goto out;
2641         }
2642
2643         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2644         timeo = sock_rcvtimeo(sk, noblock);
2645
2646         memset(&scm, 0, sizeof(scm));
2647
2648         /* Lock the socket to prevent queue disordering
2649          * while sleeps in memcpy_tomsg
2650          */
2651         mutex_lock(&u->iolock);
2652
             /* Bytes to step over first, taken from the peek offset (never negative). */
2653         skip = max(sk_peek_offset(sk, flags), 0);
2654
2655         do {
2656                 int chunk;
2657                 bool drop_skb;
2658                 struct sk_buff *skb, *last;
2659
2660 redo:
2661                 unix_state_lock(sk);
2662                 if (sock_flag(sk, SOCK_DEAD)) {
2663                         err = -ECONNRESET;
2664                         goto unlock;
2665                 }
                     /* Remember head and its length; both are handed to the wait helper. */
2666                 last = skb = skb_peek(&sk->sk_receive_queue);
2667                 last_len = last ? last->len : 0;
2668
2669 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2670                 if (skb) {
                             /*
                              * manage_oob() may drop or step over the head skb.
                              * NULL: stop if data was already copied, else
                              * re-examine the queue from scratch.
                              */
2671                         skb = manage_oob(skb, sk, flags, copied);
2672                         if (!skb) {
2673                                 unix_state_unlock(sk);
2674                                 if (copied)
2675                                         break;
2676                                 goto redo;
2677                         }
2678                 }
2679 #endif
2680 again:
                     /* Reached with the state lock held. */
2681                 if (skb == NULL) {
2682                         if (copied >= target)
2683                                 goto unlock;
2684
2685                         /*
2686                          *      POSIX 1003.1g mandates this order.
2687                          */
2688
2689                         err = sock_error(sk);
2690                         if (err)
2691                                 goto unlock;
2692                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2693                                 goto unlock;
2694
2695                         unix_state_unlock(sk);
2696                         if (!timeo) {
2697                                 err = -EAGAIN;
2698                                 break;
2699                         }
2700
                             /* Release iolock for the duration of the sleep. */
2701                         mutex_unlock(&u->iolock);
2702
2703                         timeo = unix_stream_data_wait(sk, timeo, last,
2704                                                       last_len, freezable);
2705
                             /*
                              * iolock is not held at this point, so leave via
                              * `out` (which skips the unlock and the scm
                              * delivery) and release scm here.
                              */
2706                         if (signal_pending(current)) {
2707                                 err = sock_intr_errno(timeo);
2708                                 scm_destroy(&scm);
2709                                 goto out;
2710                         }
2711
2712                         mutex_lock(&u->iolock);
2713                         goto redo;
                     /* Exit for every path that still holds the state lock. */
2714 unlock:
2715                         unix_state_unlock(sk);
2716                         break;
2717                 }
2718
                     /* Step over skbs that lie entirely within the peek offset. */
2719                 while (skip >= unix_skb_len(skb)) {
2720                         skip -= unix_skb_len(skb);
2721                         last = skb;
2722                         last_len = skb->len;
2723                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2724                         if (!skb)
2725                                 goto again;
2726                 }
2727
2728                 unix_state_unlock(sk);
2729
2730                 if (check_creds) {
2731                         /* Never glue messages from different writers */
2732                         if (!unix_skb_scm_eq(skb, &scm))
2733                                 break;
2734                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2735                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2736                         /* Copy credentials */
2737                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2738                         unix_set_secdata(&scm, skb);
2739                         check_creds = true;
2740                 }
2741
2742                 /* Copy address just once */
                     /*
                      * NOTE(review): `sunaddr` is local to this block, so the
                      * NULL store below does not prevent the copy from being
                      * repeated on a later iteration - confirm intent.
                      */
2743                 if (state->msg && state->msg->msg_name) {
2744                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2745                                          state->msg->msg_name);
2746                         unix_copy_addr(state->msg, skb->sk);
2747                         sunaddr = NULL;
2748                 }
2749
2750                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
                     /* Extra reference for the actor call, dropped right after it. */
2751                 skb_get(skb);
2752                 chunk = state->recv_actor(skb, skip, chunk, state);
2753                 drop_skb = !unix_skb_len(skb);
2754                 /* skb is only safe to use if !drop_skb */
2755                 consume_skb(skb);
2756                 if (chunk < 0) {
                             /* Report -EFAULT only if nothing was copied before. */
2757                         if (copied == 0)
2758                                 copied = -EFAULT;
2759                         break;
2760                 }
2761                 copied += chunk;
2762                 size -= chunk;
2763
2764                 if (drop_skb) {
2765                         /* the skb was touched by a concurrent reader;
2766                          * we should not expect anything from this skb
2767                          * anymore and assume it invalid - we can be
2768                          * sure it was dropped from the socket queue
2769                          *
2770                          * let's report a short read
2771                          */
2772                         err = 0;
2773                         break;
2774                 }
2775
2776                 /* Mark read part of skb as used */
2777                 if (!(flags & MSG_PEEK)) {
2778                         UNIXCB(skb).consumed += chunk;
2779
2780                         sk_peek_offset_bwd(sk, chunk);
2781
2782                         if (UNIXCB(skb).fp) {
2783                                 scm_stat_del(sk, skb);
2784                                 unix_detach_fds(&scm, skb);
2785                         }
2786
                             /* Data left in this skb: stop here. */
2787                         if (unix_skb_len(skb))
2788                                 break;
2789
                             /* skb fully read: remove it from the queue. */
2790                         skb_unlink(skb, &sk->sk_receive_queue);
2791                         consume_skb(skb);
2792
                             /* Stop once file descriptors were moved into scm. */
2793                         if (scm.fp)
2794                                 break;
2795                 } else {
2796                         /* It is questionable, see note in unix_dgram_recvmsg.
2797                          */
2798                         if (UNIXCB(skb).fp)
2799                                 unix_peek_fds(&scm, skb);
2800
2801                         sk_peek_offset_fwd(sk, chunk);
2802
2803                         if (UNIXCB(skb).fp)
2804                                 break;
2805
                             /* Keep peeking into the next skb, if there is one. */
2806                         skip = 0;
2807                         last = skb;
2808                         last_len = skb->len;
2809                         unix_state_lock(sk);
2810                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2811                         if (skb)
2812                                 goto again;
2813                         unix_state_unlock(sk);
2814                         break;
2815                 }
2816         } while (size);
2817
2818         mutex_unlock(&u->iolock);
             /* With a msghdr the collected scm data is delivered, otherwise released. */
2819         if (state->msg)
2820                 scm_recv_unix(sock, state->msg, &scm, flags);
2821         else
2822                 scm_destroy(&scm);
2823 out:
2824         return copied ? : err;
2825 }
2826
2827 static int unix_stream_read_actor(struct sk_buff *skb,
2828                                   int skip, int chunk,
2829                                   struct unix_stream_read_state *state)
2830 {
2831         int ret;
2832
2833         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2834                                     state->msg, chunk);
2835         return ret ?: chunk;
2836 }
2837
2838 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2839                           size_t size, int flags)
2840 {
2841         struct unix_stream_read_state state = {
2842                 .recv_actor = unix_stream_read_actor,
2843                 .socket = sk->sk_socket,
2844                 .msg = msg,
2845                 .size = size,
2846                 .flags = flags
2847         };
2848
2849         return unix_stream_read_generic(&state, true);
2850 }
2851
2852 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2853                                size_t size, int flags)
2854 {
2855         struct unix_stream_read_state state = {
2856                 .recv_actor = unix_stream_read_actor,
2857                 .socket = sock,
2858                 .msg = msg,
2859                 .size = size,
2860                 .flags = flags
2861         };
2862
2863 #ifdef CONFIG_BPF_SYSCALL
2864         struct sock *sk = sock->sk;
2865         const struct proto *prot = READ_ONCE(sk->sk_prot);
2866
2867         if (prot != &unix_stream_proto)
2868                 return prot->recvmsg(sk, msg, size, flags, NULL);
2869 #endif
2870         return unix_stream_read_generic(&state, true);
2871 }
2872
2873 static int unix_stream_splice_actor(struct sk_buff *skb,
2874                                     int skip, int chunk,
2875                                     struct unix_stream_read_state *state)
2876 {
2877         return skb_splice_bits(skb, state->socket->sk,
2878                                UNIXCB(skb).consumed + skip,
2879                                state->pipe, chunk, state->splice_flags);
2880 }
2881
2882 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2883                                        struct pipe_inode_info *pipe,
2884                                        size_t size, unsigned int flags)
2885 {
2886         struct unix_stream_read_state state = {
2887                 .recv_actor = unix_stream_splice_actor,
2888                 .socket = sock,
2889                 .pipe = pipe,
2890                 .size = size,
2891                 .splice_flags = flags,
2892         };
2893
2894         if (unlikely(*ppos))
2895                 return -ESPIPE;
2896
2897         if (sock->file->f_flags & O_NONBLOCK ||
2898             flags & SPLICE_F_NONBLOCK)
2899                 state.flags = MSG_DONTWAIT;
2900
2901         return unix_stream_read_generic(&state, false);
2902 }
2903
2904 static int unix_shutdown(struct socket *sock, int mode)
2905 {
2906         struct sock *sk = sock->sk;
2907         struct sock *other;
2908
2909         if (mode < SHUT_RD || mode > SHUT_RDWR)
2910                 return -EINVAL;
2911         /* This maps:
2912          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2913          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2914          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2915          */
2916         ++mode;
2917
2918         unix_state_lock(sk);
2919         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2920         other = unix_peer(sk);
2921         if (other)
2922                 sock_hold(other);
2923         unix_state_unlock(sk);
2924         sk->sk_state_change(sk);
2925
2926         if (other &&
2927                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2928
2929                 int peer_mode = 0;
2930                 const struct proto *prot = READ_ONCE(other->sk_prot);
2931
2932                 if (prot->unhash)
2933                         prot->unhash(other);
2934                 if (mode&RCV_SHUTDOWN)
2935                         peer_mode |= SEND_SHUTDOWN;
2936                 if (mode&SEND_SHUTDOWN)
2937                         peer_mode |= RCV_SHUTDOWN;
2938                 unix_state_lock(other);
2939                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2940                 unix_state_unlock(other);
2941                 other->sk_state_change(other);
2942                 if (peer_mode == SHUTDOWN_MASK)
2943                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2944                 else if (peer_mode & RCV_SHUTDOWN)
2945                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2946         }
2947         if (other)
2948                 sock_put(other);
2949
2950         return 0;
2951 }
2952
2953 long unix_inq_len(struct sock *sk)
2954 {
2955         struct sk_buff *skb;
2956         long amount = 0;
2957
2958         if (sk->sk_state == TCP_LISTEN)
2959                 return -EINVAL;
2960
2961         spin_lock(&sk->sk_receive_queue.lock);
2962         if (sk->sk_type == SOCK_STREAM ||
2963             sk->sk_type == SOCK_SEQPACKET) {
2964                 skb_queue_walk(&sk->sk_receive_queue, skb)
2965                         amount += unix_skb_len(skb);
2966         } else {
2967                 skb = skb_peek(&sk->sk_receive_queue);
2968                 if (skb)
2969                         amount = skb->len;
2970         }
2971         spin_unlock(&sk->sk_receive_queue.lock);
2972
2973         return amount;
2974 }
2975 EXPORT_SYMBOL_GPL(unix_inq_len);
2976
/* Outgoing queue size as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2982
2983 static int unix_open_file(struct sock *sk)
2984 {
2985         struct path path;
2986         struct file *f;
2987         int fd;
2988
2989         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2990                 return -EPERM;
2991
2992         if (!smp_load_acquire(&unix_sk(sk)->addr))
2993                 return -ENOENT;
2994
2995         path = unix_sk(sk)->path;
2996         if (!path.dentry)
2997                 return -ENOENT;
2998
2999         path_get(&path);
3000
3001         fd = get_unused_fd_flags(O_CLOEXEC);
3002         if (fd < 0)
3003                 goto out;
3004
3005         f = dentry_open(&path, O_PATH, current_cred());
3006         if (IS_ERR(f)) {
3007                 put_unused_fd(fd);
3008                 fd = PTR_ERR(f);
3009                 goto out;
3010         }
3011
3012         fd_install(fd, f);
3013 out:
3014         path_put(&path);
3015
3016         return fd;
3017 }
3018
3019 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3020 {
3021         struct sock *sk = sock->sk;
3022         long amount = 0;
3023         int err;
3024
3025         switch (cmd) {
3026         case SIOCOUTQ:
3027                 amount = unix_outq_len(sk);
3028                 err = put_user(amount, (int __user *)arg);
3029                 break;
3030         case SIOCINQ:
3031                 amount = unix_inq_len(sk);
3032                 if (amount < 0)
3033                         err = amount;
3034                 else
3035                         err = put_user(amount, (int __user *)arg);
3036                 break;
3037         case SIOCUNIXFILE:
3038                 err = unix_open_file(sk);
3039                 break;
3040 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3041         case SIOCATMARK:
3042                 {
3043                         struct sk_buff *skb;
3044                         int answ = 0;
3045
3046                         skb = skb_peek(&sk->sk_receive_queue);
3047                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3048                                 answ = 1;
3049                         err = put_user(answ, (int __user *)arg);
3050                 }
3051                 break;
3052 #endif
3053         default:
3054                 err = -ENOIOCTLCMD;
3055                 break;
3056         }
3057         return err;
3058 }
3059
3060 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the argument with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3065 #endif
3066
3067 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3068 {
3069         struct sock *sk = sock->sk;
3070         __poll_t mask;
3071         u8 shutdown;
3072
3073         sock_poll_wait(file, sock, wait);
3074         mask = 0;
3075         shutdown = READ_ONCE(sk->sk_shutdown);
3076
3077         /* exceptional events? */
3078         if (READ_ONCE(sk->sk_err))
3079                 mask |= EPOLLERR;
3080         if (shutdown == SHUTDOWN_MASK)
3081                 mask |= EPOLLHUP;
3082         if (shutdown & RCV_SHUTDOWN)
3083                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3084
3085         /* readable? */
3086         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3087                 mask |= EPOLLIN | EPOLLRDNORM;
3088         if (sk_is_readable(sk))
3089                 mask |= EPOLLIN | EPOLLRDNORM;
3090 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3091         if (READ_ONCE(unix_sk(sk)->oob_skb))
3092                 mask |= EPOLLPRI;
3093 #endif
3094
3095         /* Connection-based need to check for termination and startup */
3096         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3097             sk->sk_state == TCP_CLOSE)
3098                 mask |= EPOLLHUP;
3099
3100         /*
3101          * we set writable also when the other side has shut down the
3102          * connection. This prevents stuck sockets.
3103          */
3104         if (unix_writable(sk))
3105                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3106
3107         return mask;
3108 }
3109
3110 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3111                                     poll_table *wait)
3112 {
3113         struct sock *sk = sock->sk, *other;
3114         unsigned int writable;
3115         __poll_t mask;
3116         u8 shutdown;
3117
3118         sock_poll_wait(file, sock, wait);
3119         mask = 0;
3120         shutdown = READ_ONCE(sk->sk_shutdown);
3121
3122         /* exceptional events? */
3123         if (READ_ONCE(sk->sk_err) ||
3124             !skb_queue_empty_lockless(&sk->sk_error_queue))
3125                 mask |= EPOLLERR |
3126                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3127
3128         if (shutdown & RCV_SHUTDOWN)
3129                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3130         if (shutdown == SHUTDOWN_MASK)
3131                 mask |= EPOLLHUP;
3132
3133         /* readable? */
3134         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3135                 mask |= EPOLLIN | EPOLLRDNORM;
3136         if (sk_is_readable(sk))
3137                 mask |= EPOLLIN | EPOLLRDNORM;
3138
3139         /* Connection-based need to check for termination and startup */
3140         if (sk->sk_type == SOCK_SEQPACKET) {
3141                 if (sk->sk_state == TCP_CLOSE)
3142                         mask |= EPOLLHUP;
3143                 /* connection hasn't started yet? */
3144                 if (sk->sk_state == TCP_SYN_SENT)
3145                         return mask;
3146         }
3147
3148         /* No write status requested, avoid expensive OUT tests. */
3149         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3150                 return mask;
3151
3152         writable = unix_writable(sk);
3153         if (writable) {
3154                 unix_state_lock(sk);
3155
3156                 other = unix_peer(sk);
3157                 if (other && unix_peer(other) != sk &&
3158                     unix_recvq_full_lockless(other) &&
3159                     unix_dgram_peer_wake_me(sk, other))
3160                         writable = 0;
3161
3162                 unix_state_unlock(sk);
3163         }
3164
3165         if (writable)
3166                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3167         else
3168                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3169
3170         return mask;
3171 }
3172
3173 #ifdef CONFIG_PROC_FS
3174
/*
 * A seq_file position packs two values into one integer: the hash bucket
 * index in the high bits and an offset within that bucket in the low
 * BUCKET_SPACE bits.  The iteration helpers below start each bucket at
 * offset 1, i.e. the offset is 1-based.
 */
3175 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3176
     /* get_bucket(): bucket index stored in position x. */
3177 #define get_bucket(x) ((x) >> BUCKET_SPACE)
     /* get_offset(): offset within the bucket stored in position x. */
3178 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
     /* set_bucket_offset(): build a position from bucket b and offset o. */
3179 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3180
3181 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3182 {
3183         unsigned long offset = get_offset(*pos);
3184         unsigned long bucket = get_bucket(*pos);
3185         unsigned long count = 0;
3186         struct sock *sk;
3187
3188         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3189              sk; sk = sk_next(sk)) {
3190                 if (++count == offset)
3191                         break;
3192         }
3193
3194         return sk;
3195 }
3196
3197 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3198 {
3199         unsigned long bucket = get_bucket(*pos);
3200         struct net *net = seq_file_net(seq);
3201         struct sock *sk;
3202
3203         while (bucket < UNIX_HASH_SIZE) {
3204                 spin_lock(&net->unx.table.locks[bucket]);
3205
3206                 sk = unix_from_bucket(seq, pos);
3207                 if (sk)
3208                         return sk;
3209
3210                 spin_unlock(&net->unx.table.locks[bucket]);
3211
3212                 *pos = set_bucket_offset(++bucket, 1);
3213         }
3214
3215         return NULL;
3216 }
3217
3218 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3219                                   loff_t *pos)
3220 {
3221         unsigned long bucket = get_bucket(*pos);
3222
3223         sk = sk_next(sk);
3224         if (sk)
3225                 return sk;
3226
3227
3228         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3229
3230         *pos = set_bucket_offset(++bucket, 1);
3231
3232         return unix_get_first(seq, pos);
3233 }
3234
3235 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3236 {
3237         if (!*pos)
3238                 return SEQ_START_TOKEN;
3239
3240         return unix_get_first(seq, pos);
3241 }
3242
3243 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3244 {
3245         ++*pos;
3246
3247         if (v == SEQ_START_TOKEN)
3248                 return unix_get_first(seq, pos);
3249
3250         return unix_get_next(seq, v, pos);
3251 }
3252
3253 static void unix_seq_stop(struct seq_file *seq, void *v)
3254 {
3255         struct sock *sk = v;
3256
3257         if (sk)
3258                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3259 }
3260
3261 static int unix_seq_show(struct seq_file *seq, void *v)
3262 {
3263
3264         if (v == SEQ_START_TOKEN)
3265                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3266                          "Inode Path\n");
3267         else {
3268                 struct sock *s = v;
3269                 struct unix_sock *u = unix_sk(s);
3270                 unix_state_lock(s);
3271
3272                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3273                         s,
3274                         refcount_read(&s->sk_refcnt),
3275                         0,
3276                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3277                         s->sk_type,
3278                         s->sk_socket ?
3279                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3280                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3281                         sock_i_ino(s));
3282
3283                 if (u->addr) {  // under a hash table lock here
3284                         int i, len;
3285                         seq_putc(seq, ' ');
3286
3287                         i = 0;
3288                         len = u->addr->len -
3289                                 offsetof(struct sockaddr_un, sun_path);
3290                         if (u->addr->name->sun_path[0]) {
3291                                 len--;
3292                         } else {
3293                                 seq_putc(seq, '@');
3294                                 i++;
3295                         }
3296                         for ( ; i < len; i++)
3297                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3298                                          '@');
3299                 }
3300                 unix_state_unlock(s);
3301                 seq_putc(seq, '\n');
3302         }
3303
3304         return 0;
3305 }
3306
/*
 * seq_file callbacks that walk the per-netns unix socket hash table.  They
 * are handed to proc_create_net() for the "unix" proc entry.
 */
3307 static const struct seq_operations unix_seq_ops = {
3308         .start  = unix_seq_start,
3309         .next   = unix_seq_next,
3310         .stop   = unix_seq_stop,
3311         .show   = unix_seq_show,
3312 };
3313
3314 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/*
 * Private seq_file state of the BPF unix socket iterator.  Sockets of one
 * hash bucket are collected into @batch with a reference held on each.
 */
3315 struct bpf_unix_iter_state {
3316         struct seq_net_private p;
3317         unsigned int cur_sk;           /* index of the batch entry being shown */
3318         unsigned int end_sk;           /* number of entries filled in batch */
3319         unsigned int max_sk;           /* capacity of batch, in entries */
3320         struct sock **batch;           /* sockets held for the current bucket */
3321         bool st_bucket_done;           /* set once a whole bucket fit into batch */
3322 };
3323
/* Context handed to the BPF iterator program by unix_prog_seq_show(). */
3324 struct bpf_iter__unix {
3325         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3326         __bpf_md_ptr(struct unix_sock *, unix_sk);
3327         uid_t uid __aligned(8);
3328 };
3329
3330 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3331                               struct unix_sock *unix_sk, uid_t uid)
3332 {
3333         struct bpf_iter__unix ctx;
3334
3335         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3336         ctx.meta = meta;
3337         ctx.unix_sk = unix_sk;
3338         ctx.uid = uid;
3339         return bpf_iter_run_prog(prog, &ctx);
3340 }
3341
3342 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3343
3344 {
3345         struct bpf_unix_iter_state *iter = seq->private;
3346         unsigned int expected = 1;
3347         struct sock *sk;
3348
3349         sock_hold(start_sk);
3350         iter->batch[iter->end_sk++] = start_sk;
3351
3352         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3353                 if (iter->end_sk < iter->max_sk) {
3354                         sock_hold(sk);
3355                         iter->batch[iter->end_sk++] = sk;
3356                 }
3357
3358                 expected++;
3359         }
3360
3361         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3362
3363         return expected;
3364 }
3365
3366 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3367 {
3368         while (iter->cur_sk < iter->end_sk)
3369                 sock_put(iter->batch[iter->cur_sk++]);
3370 }
3371
3372 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3373                                        unsigned int new_batch_sz)
3374 {
3375         struct sock **new_batch;
3376
3377         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3378                              GFP_USER | __GFP_NOWARN);
3379         if (!new_batch)
3380                 return -ENOMEM;
3381
3382         bpf_iter_unix_put_batch(iter);
3383         kvfree(iter->batch);
3384         iter->batch = new_batch;
3385         iter->max_sk = new_batch_sz;
3386
3387         return 0;
3388 }
3389
3390 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3391                                         loff_t *pos)
3392 {
3393         struct bpf_unix_iter_state *iter = seq->private;
3394         unsigned int expected;
3395         bool resized = false;
3396         struct sock *sk;
3397
3398         if (iter->st_bucket_done)
3399                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3400
3401 again:
3402         /* Get a new batch */
3403         iter->cur_sk = 0;
3404         iter->end_sk = 0;
3405
3406         sk = unix_get_first(seq, pos);
3407         if (!sk)
3408                 return NULL; /* Done */
3409
3410         expected = bpf_iter_unix_hold_batch(seq, sk);
3411
3412         if (iter->end_sk == expected) {
3413                 iter->st_bucket_done = true;
3414                 return sk;
3415         }
3416
3417         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3418                 resized = true;
3419                 goto again;
3420         }
3421
3422         return sk;
3423 }
3424
3425 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3426 {
3427         if (!*pos)
3428                 return SEQ_START_TOKEN;
3429
3430         /* bpf iter does not support lseek, so it always
3431          * continue from where it was stop()-ped.
3432          */
3433         return bpf_iter_unix_batch(seq, pos);
3434 }
3435
3436 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3437 {
3438         struct bpf_unix_iter_state *iter = seq->private;
3439         struct sock *sk;
3440
3441         /* Whenever seq_next() is called, the iter->cur_sk is
3442          * done with seq_show(), so advance to the next sk in
3443          * the batch.
3444          */
3445         if (iter->cur_sk < iter->end_sk)
3446                 sock_put(iter->batch[iter->cur_sk++]);
3447
3448         ++*pos;
3449
3450         if (iter->cur_sk < iter->end_sk)
3451                 sk = iter->batch[iter->cur_sk];
3452         else
3453                 sk = bpf_iter_unix_batch(seq, pos);
3454
3455         return sk;
3456 }
3457
3458 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3459 {
3460         struct bpf_iter_meta meta;
3461         struct bpf_prog *prog;
3462         struct sock *sk = v;
3463         uid_t uid;
3464         bool slow;
3465         int ret;
3466
3467         if (v == SEQ_START_TOKEN)
3468                 return 0;
3469
3470         slow = lock_sock_fast(sk);
3471
3472         if (unlikely(sk_unhashed(sk))) {
3473                 ret = SEQ_SKIP;
3474                 goto unlock;
3475         }
3476
3477         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3478         meta.seq = seq;
3479         prog = bpf_iter_get_info(&meta, false);
3480         ret = unix_prog_seq_show(prog, &meta, v, uid);
3481 unlock:
3482         unlock_sock_fast(sk, slow);
3483         return ret;
3484 }
3485
3486 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3487 {
3488         struct bpf_unix_iter_state *iter = seq->private;
3489         struct bpf_iter_meta meta;
3490         struct bpf_prog *prog;
3491
3492         if (!v) {
3493                 meta.seq = seq;
3494                 prog = bpf_iter_get_info(&meta, true);
3495                 if (prog)
3496                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3497         }
3498
3499         if (iter->cur_sk < iter->end_sk)
3500                 bpf_iter_unix_put_batch(iter);
3501 }
3502
3503 static const struct seq_operations bpf_iter_unix_seq_ops = {
3504         .start  = bpf_iter_unix_seq_start,
3505         .next   = bpf_iter_unix_seq_next,
3506         .stop   = bpf_iter_unix_seq_stop,
3507         .show   = bpf_iter_unix_seq_show,
3508 };
3509 #endif
3510 #endif
3511
3512 static const struct net_proto_family unix_family_ops = {
3513         .family = PF_UNIX,
3514         .create = unix_create,
3515         .owner  = THIS_MODULE,
3516 };
3517
3518
3519 static int __net_init unix_net_init(struct net *net)
3520 {
3521         int i;
3522
3523         net->unx.sysctl_max_dgram_qlen = 10;
3524         if (unix_sysctl_register(net))
3525                 goto out;
3526
3527 #ifdef CONFIG_PROC_FS
3528         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3529                              sizeof(struct seq_net_private)))
3530                 goto err_sysctl;
3531 #endif
3532
3533         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3534                                               sizeof(spinlock_t), GFP_KERNEL);
3535         if (!net->unx.table.locks)
3536                 goto err_proc;
3537
3538         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3539                                                 sizeof(struct hlist_head),
3540                                                 GFP_KERNEL);
3541         if (!net->unx.table.buckets)
3542                 goto free_locks;
3543
3544         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3545                 spin_lock_init(&net->unx.table.locks[i]);
3546                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3547         }
3548
3549         return 0;
3550
3551 free_locks:
3552         kvfree(net->unx.table.locks);
3553 err_proc:
3554 #ifdef CONFIG_PROC_FS
3555         remove_proc_entry("unix", net->proc_net);
3556 err_sysctl:
3557 #endif
3558         unix_sysctl_unregister(net);
3559 out:
3560         return -ENOMEM;
3561 }
3562
/* Per-network-namespace teardown for AF_UNIX; counterpart of
 * unix_net_init().
 *
 * Frees the bucket and lock arrays, unregisters the sysctl table and
 * removes the "unix" proc entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
        kvfree(net->unx.table.buckets);
        kvfree(net->unx.table.locks);
        unix_sysctl_unregister(net);
        /* NOTE(review): unlike in unix_net_init(), this call is not
         * wrapped in #ifdef CONFIG_PROC_FS; presumably a no-op stub
         * exists for !CONFIG_PROC_FS builds - confirm.
         */
        remove_proc_entry("unix", net->proc_net);
}
3570
3571 static struct pernet_operations unix_net_ops = {
3572         .init = unix_net_init,
3573         .exit = unix_net_exit,
3574 };
3575
3576 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the bpf iterator entry for the "unix" target with the
 * arguments (meta, unix_sk, uid).  The unix_sk argument is the one
 * unix_reg_info describes through struct bpf_iter__unix.
 * NOTE(review): the expansion of DEFINE_BPF_ITER_FUNC() is not visible
 * here; presumably it emits the bpf_iter_unix() attach point - confirm.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
                     struct unix_sock *unix_sk, uid_t uid)
3579
/* Batch size requested from bpf_iter_unix_realloc_batch() at init time. */
#define INIT_BATCH_SZ 16

/* init_seq_private hook of the unix bpf iterator.
 *
 * Runs bpf_iter_init_seq_net() on @priv_data, then sizes the socket
 * batch to INIT_BATCH_SZ entries.  If the batch allocation fails, the
 * seq_net part is torn down again before the error is returned.
 *
 * Returns 0 on success or the error reported by the failing step.
 */
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
        int err = bpf_iter_init_seq_net(priv_data, aux);

        if (err)
                return err;

        err = bpf_iter_unix_realloc_batch(priv_data, INIT_BATCH_SZ);
        if (err)
                bpf_iter_fini_seq_net(priv_data);

        return err;
}
3599
3600 static void bpf_iter_fini_unix(void *priv_data)
3601 {
3602         struct bpf_unix_iter_state *iter = priv_data;
3603
3604         bpf_iter_fini_seq_net(priv_data);
3605         kvfree(iter->batch);
3606 }
3607
3608 static const struct bpf_iter_seq_info unix_seq_info = {
3609         .seq_ops                = &bpf_iter_unix_seq_ops,
3610         .init_seq_private       = bpf_iter_init_unix,
3611         .fini_seq_private       = bpf_iter_fini_unix,
3612         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3613 };
3614
3615 static const struct bpf_func_proto *
3616 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3617                              const struct bpf_prog *prog)
3618 {
3619         switch (func_id) {
3620         case BPF_FUNC_setsockopt:
3621                 return &bpf_sk_setsockopt_proto;
3622         case BPF_FUNC_getsockopt:
3623                 return &bpf_sk_getsockopt_proto;
3624         default:
3625                 return NULL;
3626         }
3627 }
3628
3629 static struct bpf_iter_reg unix_reg_info = {
3630         .target                 = "unix",
3631         .ctx_arg_info_size      = 1,
3632         .ctx_arg_info           = {
3633                 { offsetof(struct bpf_iter__unix, unix_sk),
3634                   PTR_TO_BTF_ID_OR_NULL },
3635         },
3636         .get_func_proto         = bpf_iter_unix_get_func_proto,
3637         .seq_info               = &unix_seq_info,
3638 };
3639
3640 static void __init bpf_iter_register(void)
3641 {
3642         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3643         if (bpf_iter_reg_target(&unix_reg_info))
3644                 pr_warn("Warning: could not register bpf iterator unix\n");
3645 }
3646 #endif
3647
3648 static int __init af_unix_init(void)
3649 {
3650         int i, rc = -1;
3651
3652         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3653
3654         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3655                 spin_lock_init(&bsd_socket_locks[i]);
3656                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3657         }
3658
3659         rc = proto_register(&unix_dgram_proto, 1);
3660         if (rc != 0) {
3661                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3662                 goto out;
3663         }
3664
3665         rc = proto_register(&unix_stream_proto, 1);
3666         if (rc != 0) {
3667                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3668                 proto_unregister(&unix_dgram_proto);
3669                 goto out;
3670         }
3671
3672         sock_register(&unix_family_ops);
3673         register_pernet_subsys(&unix_net_ops);
3674         unix_bpf_build_proto();
3675
3676 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3677         bpf_iter_register();
3678 #endif
3679
3680 out:
3681         return rc;
3682 }
3683
/* Module exit: unregisters the PF_UNIX socket family, both protos and
 * the per-netns operations that af_unix_init() registered.
 */
static void __exit af_unix_exit(void)
{
        sock_unregister(PF_UNIX);
        proto_unregister(&unix_dgram_proto);
        proto_unregister(&unix_stream_proto);
        unregister_pernet_subsys(&unix_net_ops);
}
3691
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);