fs/fcntl.c

   1 /*
   2  *  linux/fs/fcntl.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 #include <linux/init.h>
   8 #include <linux/mm.h>
   9 #include <linux/file.h>
  10 #include <linux/dnotify.h>
  11 #include <linux/smp_lock.h>
  12 #include <linux/slab.h>
  13 #include <linux/iobuf.h>
  14 #include <linux/ptrace.h>
  15
  16 #include <asm/poll.h>
  17 #include <asm/siginfo.h>
  18 #include <asm/uaccess.h>
  19
  20 extern int sock_fcntl (struct file *, unsigned int cmd, unsigned long arg);
  21 extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
  22 extern int fcntl_getlease(struct file *filp);
  23
  24 /* Expand files.  Return <0 on error; 0 nothing done; 1 files expanded,
  25  * we may have blocked.
  26  *
  27  * Should be called with the files->file_lock spinlock held for write.
  28  */
  29 static int expand_files(struct files_struct *files, int nr)
  30 {
  31         int err, expand = 0;
  32 #ifdef FDSET_DEBUG
  33         printk (KERN_ERR __FUNCTION__ " %d: nr = %d\n", current->pid, nr);
  34 #endif
  35
  36         if (nr >= files->max_fdset) {
  37                 expand = 1;
  38                 if ((err = expand_fdset(files, nr)))
  39                         goto out;
  40         }
  41         if (nr >= files->max_fds) {
  42                 expand = 1;
  43                 if ((err = expand_fd_array(files, nr)))
  44                         goto out;
  45         }
  46         err = expand;
  47  out:
  48 #ifdef FDSET_DEBUG
  49         if (err)
  50                 printk (KERN_ERR __FUNCTION__ " %d: return %d\n", current->pid, err);
  51 #endif
  52         return err;
  53 }
  54
  55 /*
  56  * locate_fd finds a free file descriptor in the open_fds fdset,
  57  * expanding the fd arrays if necessary.  The files write lock will be
  58  * held on exit to ensure that the fd can be entered atomically.
  59  */
  60
  61 static int locate_fd(struct files_struct *files,
  62                             struct file *file, int orig_start)
  63 {
  64         unsigned int newfd;
  65         int error;
  66         int start;
  67
  68         write_lock(&files->file_lock);
  69
  70         error = -EINVAL;
  71         if (orig_start >= current->rlim[RLIMIT_NOFILE].rlim_cur)
  72                 goto out;
  73
  74 repeat:
  75         /*
  76          * Someone might have closed fd's in the range
  77          * orig_start..files->next_fd
  78          */
  79         start = orig_start;
  80         if (start < files->next_fd)
  81                 start = files->next_fd;
  82
  83         newfd = start;
  84         if (start < files->max_fdset) {
  85                 newfd = find_next_zero_bit(files->open_fds->fds_bits,
  86                         files->max_fdset, start);
  87         }
  88
  89         error = -EMFILE;
  90         if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur)
  91                 goto out;
  92
  93         error = expand_files(files, newfd);
  94         if (error < 0)
  95                 goto out;
  96
  97         /*
  98          * If we needed to expand the fs array we
  99          * might have blocked - try again.
 100          */
 101         if (error)
 102                 goto repeat;
 103
 104         if (start <= files->next_fd)
 105                 files->next_fd = newfd + 1;
 106
 107         error = newfd;
 108
 109 out:
 110         return error;
 111 }
 112
 113 static inline void allocate_fd(struct files_struct *files,
 114                                         struct file *file, int fd)
 115 {
 116         FD_SET(fd, files->open_fds);
 117         FD_CLR(fd, files->close_on_exec);
 118         write_unlock(&files->file_lock);
 119         fd_install(fd, file);
 120 }
 121
 122 static int dupfd(struct file *file, int start)
 123 {
 124         struct files_struct * files = current->files;
 125         int ret;
 126
 127         ret = locate_fd(files, file, start);
 128         if (ret < 0)
 129                 goto out_putf;
 130         allocate_fd(files, file, ret);
 131         return ret;
 132
 133 out_putf:
 134         write_unlock(&files->file_lock);
 135         fput(file);
 136         return ret;
 137 }
 138
 139 asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 140 {
 141         int err = -EBADF;
 142         struct file * file, *tofree;
 143         struct files_struct * files = current->files;
 144
 145         write_lock(&files->file_lock);
 146         if (!(file = fcheck(oldfd)))
 147                 goto out_unlock;
 148         err = newfd;
 149         if (newfd == oldfd)
 150                 goto out_unlock;
 151         err = -EBADF;
 152         if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur)
 153                 goto out_unlock;
 154         get_file(file);                 /* We are now finished with oldfd */
 155
 156         err = expand_files(files, newfd);
 157         if (err < 0)
 158                 goto out_fput;
 159
 160         /* To avoid races with open() and dup(), we will mark the fd as
 161          * in-use in the open-file bitmap throughout the entire dup2()
 162          * process.  This is quite safe: do_close() uses the fd array
 163          * entry, not the bitmap, to decide what work needs to be
 164          * done.  --sct */
 165         /* Doesn't work. open() might be there first. --AV */
 166
 167         /* Yes. It's a race. In user space. Nothing sane to do */
 168         err = -EBUSY;
 169         tofree = files->fd[newfd];
 170         if (!tofree && FD_ISSET(newfd, files->open_fds))
 171                 goto out_fput;
 172
 173         files->fd[newfd] = file;
 174         FD_SET(newfd, files->open_fds);
 175         FD_CLR(newfd, files->close_on_exec);
 176         write_unlock(&files->file_lock);
 177
 178         if (tofree)
 179                 filp_close(tofree, files);
 180         err = newfd;
 181 out:
 182         return err;
 183 out_unlock:
 184         write_unlock(&files->file_lock);
 185         goto out;
 186
 187 out_fput:
 188         write_unlock(&files->file_lock);
 189         fput(file);
 190         goto out;
 191 }
 192
 193 asmlinkage long sys_dup(unsigned int fildes)
 194 {
 195         int ret = -EBADF;
 196         struct file * file = fget(fildes);
 197
 198         if (file)
 199                 ret = dupfd(file, 0);
 200         return ret;
 201 }
 202
 203 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT)
 204
 205 static int setfl(int fd, struct file * filp, unsigned long arg)
 206 {
 207         struct inode * inode = filp->f_dentry->d_inode;
 208         int error;
 209
 210         /*
 211          * In the case of an append-only file, O_APPEND
 212          * cannot be cleared
 213          */
 214         if (!(arg & O_APPEND) && IS_APPEND(inode))
 215                 return -EPERM;
 216
 217         /* Did FASYNC state change? */
 218         if ((arg ^ filp->f_flags) & FASYNC) {
 219                 if (filp->f_op && filp->f_op->fasync) {
 220                         error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
 221                         if (error < 0)
 222                                 return error;
 223                 }
 224         }
 225
 226         if (arg & O_DIRECT) {
 227                 /*
 228                  * alloc_kiovec() can sleep and we are only serialized by
 229                  * the big kernel lock here, so abuse the i_sem to serialize
 230                  * this case too. We of course wouldn't need to go deep down
 231                  * to the inode layer, we could stay at the file layer, but
 232                  * we don't want to pay for the memory of a semaphore in each
 233                  * file structure too and we use the inode semaphore that we just
 234                  * pay for anyways.
 235                  */
 236                 error = 0;
 237                 down(&inode->i_sem);
 238                 if (!filp->f_iobuf)
 239                         error = alloc_kiovec(1, &filp->f_iobuf);
 240                 up(&inode->i_sem);
 241                 if (error < 0)
 242                         return error;
 243         }
 244
 245         /* required for strict SunOS emulation */
 246         if (O_NONBLOCK != O_NDELAY)
 247                if (arg & O_NDELAY)
 248                    arg |= O_NONBLOCK;
 249
 250         filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 251         return 0;
 252 }
 253
 254 static long do_fcntl(unsigned int fd, unsigned int cmd,
 255                      unsigned long arg, struct file * filp)
 256 {
 257         long err = -EINVAL;
 258
 259         switch (cmd) {
 260                 case F_DUPFD:
 261                         if (arg < NR_OPEN) {
 262                                 get_file(filp);
 263                                 err = dupfd(filp, arg);
 264                         }
 265                         break;
 266                 case F_GETFD:
 267                         err = get_close_on_exec(fd);
 268                         break;
 269                 case F_SETFD:
 270                         err = 0;
 271                         set_close_on_exec(fd, arg&1);
 272                         break;
 273                 case F_GETFL:
 274                         err = filp->f_flags;
 275                         break;
 276                 case F_SETFL:
 277                         lock_kernel();
 278                         err = setfl(fd, filp, arg);
 279                         unlock_kernel();
 280                         break;
 281                 case F_GETLK:
 282                         err = fcntl_getlk(fd, (struct flock *) arg);
 283                         break;
 284                 case F_SETLK:
 285                 case F_SETLKW:
 286                         err = fcntl_setlk(fd, cmd, (struct flock *) arg);
 287                         break;
 288                 case F_GETOWN:
 289                         /*
 290                          * XXX If f_owner is a process group, the
 291                          * negative return value will get converted
 292                          * into an error.  Oops.  If we keep the
 293                          * current syscall conventions, the only way
 294                          * to fix this will be in libc.
 295                          */
 296                         err = filp->f_owner.pid;
 297                         force_successful_syscall_return();
 298                         break;
 299                 case F_SETOWN:
 300                         lock_kernel();
 301                         filp->f_owner.pid = arg;
 302                         filp->f_owner.uid = current->uid;
 303                         filp->f_owner.euid = current->euid;
 304                         err = 0;
 305                         if (S_ISSOCK (filp->f_dentry->d_inode->i_mode))
 306                                 err = sock_fcntl (filp, F_SETOWN, arg);
 307                         unlock_kernel();
 308                         break;
 309                 case F_GETSIG:
 310                         err = filp->f_owner.signum;
 311                         break;
 312                 case F_SETSIG:
 313                         /* arg == 0 restores default behaviour. */
 314                         if (arg < 0 || arg > _NSIG) {
 315                                 break;
 316                         }
 317                         err = 0;
 318                         filp->f_owner.signum = arg;
 319                         break;
 320                 case F_GETLEASE:
 321                         err = fcntl_getlease(filp);
 322                         break;
 323                 case F_SETLEASE:
 324                         err = fcntl_setlease(fd, filp, arg);
 325                         break;
 326                 case F_NOTIFY:
 327                         err = fcntl_dirnotify(fd, filp, arg);
 328                         break;
 329                 default:
 330                         /* sockets need a few special fcntls. */
 331                         err = -EINVAL;
 332                         if (S_ISSOCK (filp->f_dentry->d_inode->i_mode))
 333                                 err = sock_fcntl (filp, cmd, arg);
 334                         break;
 335         }
 336
 337         return err;
 338 }
 339
 340 asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
 341 {
 342         struct file * filp;
 343         long err = -EBADF;
 344
 345         filp = fget(fd);
 346         if (!filp)
 347                 goto out;
 348
 349         err = do_fcntl(fd, cmd, arg, filp);
 350
 351         fput(filp);
 352 out:
 353         return err;
 354 }
 355
 356 #if BITS_PER_LONG == 32
 357 asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg)
 358 {
 359         struct file * filp;
 360         long err;
 361
 362         err = -EBADF;
 363         filp = fget(fd);
 364         if (!filp)
 365                 goto out;
 366
 367         switch (cmd) {
 368                 case F_GETLK64:
 369                         err = fcntl_getlk64(fd, (struct flock64 *) arg);
 370                         break;
 371                 case F_SETLK64:
 372                         err = fcntl_setlk64(fd, cmd, (struct flock64 *) arg);
 373                         break;
 374                 case F_SETLKW64:
 375                         err = fcntl_setlk64(fd, cmd, (struct flock64 *) arg);
 376                         break;
 377                 default:
 378                         err = do_fcntl(fd, cmd, arg, filp);
 379                         break;
 380         }
 381         fput(filp);
 382 out:
 383         return err;
 384 }
 385 #endif
 386
 387 /* Table to convert sigio signal codes into poll band bitmaps */
 388
 389 static long band_table[NSIGPOLL] = {
 390         POLLIN | POLLRDNORM,                    /* POLL_IN */
 391         POLLOUT | POLLWRNORM | POLLWRBAND,      /* POLL_OUT */
 392         POLLIN | POLLRDNORM | POLLMSG,          /* POLL_MSG */
 393         POLLERR,                                /* POLL_ERR */
 394         POLLPRI | POLLRDBAND,                   /* POLL_PRI */
 395         POLLHUP | POLLERR                       /* POLL_HUP */
 396 };
 397
 398 static void send_sigio_to_task(struct task_struct *p,
 399                                struct fown_struct *fown,
 400                                int fd,
 401                                int reason)
 402 {
 403         if ((fown->euid != 0) &&
 404             (fown->euid ^ p->suid) && (fown->euid ^ p->uid) &&
 405             (fown->uid ^ p->suid) && (fown->uid ^ p->uid))
 406                 return;
 407         switch (fown->signum) {
 408                 siginfo_t si;
 409                 default:
 410                         /* Queue a rt signal with the appropriate fd as its
 411                            value.  We use SI_SIGIO as the source, not
 412                            SI_KERNEL, since kernel signals always get
 413                            delivered even if we can't queue.  Failure to
 414                            queue in this case _should_ be reported; we fall
 415                            back to SIGIO in that case. --sct */
 416                         si.si_signo = fown->signum;
 417                         si.si_errno = 0;
 418                         si.si_code  = reason;
 419                         /* Make sure we are called with one of the POLL_*
 420                            reasons, otherwise we could leak kernel stack into
 421                            userspace.  */
 422                         if ((reason & __SI_MASK) != __SI_POLL)
 423                                 BUG();
 424                         if (reason - POLL_IN >= NSIGPOLL)
 425                                 si.si_band  = ~0L;
 426                         else
 427                                 si.si_band = band_table[reason - POLL_IN];
 428                         si.si_fd    = fd;
 429                         if (!send_sig_info(fown->signum, &si, p))
 430                                 break;
 431                 /* fall-through: fall back on the old plain SIGIO signal */
 432                 case 0:
 433                         send_sig(SIGIO, p, 1);
 434         }
 435 }
 436
 437 void send_sigio(struct fown_struct *fown, int fd, int band)
 438 {
 439         struct task_struct * p;
 440         int   pid       = fown->pid;
 441
 442         read_lock(&tasklist_lock);
 443         if ( (pid > 0) && (p = find_task_by_pid(pid)) ) {
 444                 send_sigio_to_task(p, fown, fd, band);
 445                 goto out;
 446         }
 447         for_each_task(p) {
 448                 int match = p->pid;
 449                 if (pid < 0)
 450                         match = -p->pgrp;
 451                 if (pid != match)
 452                         continue;
 453                 send_sigio_to_task(p, fown, fd, band);
 454         }
 455 out:
 456         read_unlock(&tasklist_lock);
 457 }
 458
 459 static rwlock_t fasync_lock = RW_LOCK_UNLOCKED;
 460 static kmem_cache_t *fasync_cache;
 461
 462 /*
 463  * fasync_helper() is used by some character device drivers (mainly mice)
 464  * to set up the fasync queue. It returns negative on error, 0 if it did
 465  * no changes and positive if it added/deleted the entry.
 466  */
 467 int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
 468 {
 469         struct fasync_struct *fa, **fp;
 470         struct fasync_struct *new = NULL;
 471         int result = 0;
 472
 473         if (on) {
 474                 new = kmem_cache_alloc(fasync_cache, SLAB_KERNEL);
 475                 if (!new)
 476                         return -ENOMEM;
 477         }
 478         write_lock_irq(&fasync_lock);
 479         for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 480                 if (fa->fa_file == filp) {
 481                         if(on) {
 482                                 fa->fa_fd = fd;
 483                                 kmem_cache_free(fasync_cache, new);
 484                         } else {
 485                                 *fp = fa->fa_next;
 486                                 kmem_cache_free(fasync_cache, fa);
 487                                 result = 1;
 488                         }
 489                         goto out;
 490                 }
 491         }
 492
 493         if (on) {
 494                 new->magic = FASYNC_MAGIC;
 495                 new->fa_file = filp;
 496                 new->fa_fd = fd;
 497                 new->fa_next = *fapp;
 498                 *fapp = new;
 499                 result = 1;
 500         }
 501 out:
 502         write_unlock_irq(&fasync_lock);
 503         return result;
 504 }
 505
 506 void __kill_fasync(struct fasync_struct *fa, int sig, int band)
 507 {
 508         while (fa) {
 509                 struct fown_struct * fown;
 510                 if (fa->magic != FASYNC_MAGIC) {
 511                         printk(KERN_ERR "kill_fasync: bad magic number in "
 512                                "fasync_struct!\n");
 513                         return;
 514                 }
 515                 fown = &fa->fa_file->f_owner;
 516                 /* Don't send SIGURG to processes which have not set a
 517                    queued signum: SIGURG has its own default signalling
 518                    mechanism. */
 519                 if (fown->pid && !(sig == SIGURG && fown->signum == 0))
 520                         send_sigio(fown, fa->fa_fd, band);
 521                 fa = fa->fa_next;
 522         }
 523 }
 524
 525 void kill_fasync(struct fasync_struct **fp, int sig, int band)
 526 {
 527         read_lock(&fasync_lock);
 528         __kill_fasync(*fp, sig, band);
 529         read_unlock(&fasync_lock);
 530 }
 531
 532 static int __init fasync_init(void)
 533 {
 534         fasync_cache = kmem_cache_create("fasync_cache",
 535                 sizeof(struct fasync_struct), 0, 0, NULL, NULL);
 536         if (!fasync_cache)
 537                 panic("cannot create fasync slab cache");
 538         return 0;
 539 }
 540
 541 module_init(fasync_init)