toys/posix/sed.c

   1 /* sed.c - stream editor. Thing that does s/// and other stuff.
   2  *
   3  * Copyright 2014 Rob Landley <rob@landley.net>
   4  *
   5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6  *
   7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8  * but N and s///
   9  * TODO: make y// handle unicode
  10  * TODO: handle error return from emit(), error_msg/exit consistently
  11  *       What's the right thing to do for -i when write fails? Skip to next?
  12
  13 USE_SED(NEWTOY(sed, "(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
  14
  15 config SED
  16   bool "sed"
  17   default y
  18   help
  19     usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  20
  21     Stream editor. Apply one or more editing SCRIPTs to each line of input
  22     (from FILE or stdin) producing output (by default to stdout).
  23
  24     -e  add SCRIPT to list
  25     -f  add contents of SCRIPT_FILE to list
  26     -i  Edit each file in place.
  27     -n  No default output. (Use the p command to output matched lines.)
  28     -r  Use extended regular expression syntax.
  29     -E  Alias for -r.
  30     -s  Treat input files separately (implied by -i)
  31
  32     A SCRIPT is a series of one or more COMMANDs separated by newlines or
  33     semicolons. All -e SCRIPTs are concatenated together as if separated
  34     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  35     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  36
  37     Each COMMAND may be preceded by an address which limits the command to
  38     apply only to the specified line(s). Commands without an address apply to
  39     every line. Addresses are of the form:
  40
  41       [ADDRESS[,ADDRESS]]COMMAND
  42
  43     The ADDRESS may be a decimal line number (starting at 1), a /regular
  44     expression/ within a pair of forward slashes, or the character "$" which
  45     matches the last line of input. (In -s or -i mode this matches the last
  46     line of each file, otherwise just the last line of the last file.) A single
  47     address matches one line, a pair of comma separated addresses match
  48     everything from the first address to the second address (inclusive). If
  49     both addresses are regular expressions, more than one range of lines in
  50     each file can match.
  51
  52     REGULAR EXPRESSIONS in sed are started and ended by the same character
  53     (traditionally / but anything except a backslash or a newline works).
  54     Backslashes may be used to escape the delimiter if it occurs in the
  55     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  56     and unicode). An empty regex repeats the previous one. ADDRESS regexes
   57     (above) require the first delimiter to be escaped with a backslash when
  58     it isn't a forward slash (to distinguish it from the COMMANDs below).
  59
  60     Sed mostly operates on individual lines one at a time. It reads each line,
  61     processes it, and either writes it to the output or discards it before
  62     reading the next line. Sed can remember one additional line in a separate
  63     buffer (using the h, H, g, G, and x commands), and can read the next line
   64     of input early (using the n and N commands), but other than that command
  65     scripts operate on individual lines of text.
  66
  67     Each COMMAND starts with a single character. The following commands take
  68     no arguments:
  69
  70       {  Start a new command block, continuing until a corresponding "}".
  71          Command blocks may nest. If the block has an address, commands within
  72          the block are only run for lines within the block's address range.
  73
  74       }  End command block (this command cannot have an address)
  75
  76       d  Delete this line and move on to the next one
  77          (ignores remaining COMMANDs)
  78
  79       D  Delete one line of input and restart command SCRIPT (same as "d"
  80          unless you've glued lines together with "N" or similar)
  81
  82       g  Get remembered line (overwriting current line)
  83
  84       G  Get remembered line (appending to current line)
  85
  86       h  Remember this line (overwriting remembered line)
  87
  88       H  Remember this line (appending to remembered line, if any)
  89
  90       l  Print line, escaping \abfrtv (but not newline), octal escaping other
  91          nonprintable characters, wrapping lines to terminal width with a
  92          backslash, and appending $ to actual end of line.
  93
  94       n  Print default output and read next line, replacing current line
  95          (If no next line available, quit processing script)
  96
  97       N  Append next line of input to this line, separated by a newline
  98          (This advances the line counter for address matching and "=", if no
  99          next line available quit processing script without default output)
 100
 101       p  Print this line
 102
 103       P  Print this line up to first newline (from "N")
 104
 105       q  Quit (print default output, no more commands processed or lines read)
 106
 107       x  Exchange this line with remembered line (overwrite in both directions)
 108
 109       =  Print the current line number (followed by a newline)
 110
 111     The following commands (may) take an argument. The "text" arguments (to
 112     the "a", "b", and "c" commands) may end with an unescaped "\" to append
 113     the next line (for which leading whitespace is not skipped), and also
 114     treat ";" as a literal character (use "\;" instead).
 115
 116       a [text]   Append text to output before attempting to read next line
 117
 118       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 119
 120       c [text]   Delete line, output text at end of matching address range
 121                  (ignores remaining COMMANDs)
 122
 123       i [text]   Print text
 124
 125       r [file]   Append contents of file to output before attempting to read
 126                  next line.
 127
 128       s/S/R/F    Search for regex S, replace matched text with R using flags F.
 129                  The first character after the "s" (anything but newline or
 130                  backslash) is the delimiter, escape with \ to use normally.
 131
 132                  The replacement text may contain "&" to substitute the matched
 133                  text (escape it with backslash for a literal &), or \1 through
 134                  \9 to substitute a parenthetical subexpression in the regex.
 135                  You can also use the normal backslash escapes such as \n and
 136                  a backslash at the end of the line appends the next line.
 137
 138                  The flags are:
 139
 140                  [0-9]    A number, substitute only that occurrence of pattern
 141                  g        Global, substitute all occurrences of pattern
 142                  i        Ignore case when matching
 143                  p        Print the line if match was found and replaced
 144                  w [file] Write (append) line to file if match replaced
 145
 146       t [label]  Test, jump to :label only if an "s" command found a match in
 147                  this line since last test (replacing with same text counts)
 148
 149       T [label]  Test false, jump only if "s" hasn't found a match.
 150
 151       w [file]   Write (append) line to file
 152
 153       y/old/new/ Change each character in 'old' to corresponding character
 154                  in 'new' (with standard backslash escapes, delimiter can be
 155                  any repeated character except \ or \n)
 156
 157       : [label]  Labeled target for jump commands
 158
 159       #  Comment, ignore rest of this line of SCRIPT
 160
 161     Deviations from posix: allow extended regular expressions with -r,
 162     editing in place with -i, separate with -s, printf escapes in text, line
 163     continuations, semicolons after all commands, 2-address anywhere an
 164     address is allowed, "T" command, multiline continuations for [abc],
 165     \; to end [abc] argument before end of line.
 166 */
 167
 168 #define FOR_sed
 169 #include "toys.h"
 170
  171 GLOBALS(
        // Script sources. Presumably filled in by option parsing from the "e*f*"
        // part of the option string (-f SCRIPT_FILE and -e SCRIPT lists) - this
        // is not visible here, confirm against the option parsing code.
  172   struct arg_list *f;
  173   struct arg_list *e;
  174
  175   // processed pattern list
        // (walked as a list of struct sedcmd by process_line() and do_sed())
  176   struct double_list *pattern;
  177
        // nextline/nextlen: input line read ahead but not yet run through the
        //   script (process_line() works one line behind its input).
        // remember/rememberlen: buffer used by the g, G, h, H and x commands.
        //   While parsing, remember temporarily holds the s/// regex text and
        //   nextlen counts unclosed '{' commands.
        // restart: command to resume at after n/N, stored +1 so it is never NULL.
        // lastregex: most recently used regex, repeated by an empty regex.
        // count: number of the input line currently being processed.
  178   char *nextline, *remember;
  179   void *restart, *lastregex;
  180   long nextlen, rememberlen, count;
        // fdout: file descriptor emit() writes to.
        // noeol: nonzero when the last emit() left out its trailing newline.
  181   int fdout, noeol;
        // xx: cached wrap width for the 'l' command (0 until first use).
  182   unsigned xx;
  183 )
 184
  185 // Linked list of parsed sed commands. Offset fields indicate location where
  186 // regex or string starts, ala offset+(char *)struct, because we realloc()
  187 // these to expand them for multiline inputs, and pointers would have to be
  188 // individually adjusted. An offset of 0 means "not present".
  189
  190 struct sedcmd {
  191   struct sedcmd *next, *prev;
  192
  193   // Begin and end of each match
  194   long lmatch[2]; // line number of match (-1 for "$", 0 if unused)
  195   int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
  196   int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
        // At offset w: 4 bytes of file descriptor, 1 byte of saved noeol state,
        // then the filename.
        // not: set by '!' before the command, inverts the address match.
        // hit: address range currently active. During parsing it instead holds
        // line continuation state.
  197   unsigned not, hit;
  198   unsigned sflags; // s///flag bits: i=1, g=2, p=4, occurrence number <<3
  199   char c; // action (the command character)
  200 };
 201
 202 // Write out line with potential embedded NUL, handling eol/noeol
 203 static int emit(char *line, long len, int eol)
 204 {
 205   int l, old = line[len];
 206
 207   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 208   TT.noeol = !eol;
 209   if (eol) line[len++] = '\n';
 210   if (!len) return 0;
 211   l = writeall(TT.fdout, line, len);
 212   if (eol) line[len-1] = old;
 213   if (l != len) {
 214     perror_msg("short write");
 215
 216     return 1;
 217   }
 218
 219   return 0;
 220 }
 221
 222 // Extend allocation to include new string, with newline between if newlen<0
 223
 224 static char *extend_string(char **old, char *new, int oldlen, int newlen)
 225 {
 226   int newline = newlen < 0;
 227   char *s;
 228
 229   if (newline) newlen = -newlen;
 230   s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 231   if (newline) s[oldlen++] = '\n';
 232   memcpy(s+oldlen, new, newlen);
 233   s[oldlen+newlen] = 0;
 234
 235   return s+oldlen+newlen+1;
 236 }
 237
 238 // An empty regex repeats the previous one
 239 static void *get_regex(void *trump, int offset)
 240 {
 241   if (!offset) {
 242     if (!TT.lastregex) error_exit("no previous regex");
 243     return TT.lastregex;
 244   }
 245
 246   return TT.lastregex = offset+(char *)trump;
 247 }
 248
 249 // Apply pattern to line from input file
 250 static void process_line(char **pline, long plen)
 251 {
 252   struct append {
 253     struct append *next, *prev;
 254     int file;
 255     char *str;
 256   } *append = 0;
 257   char *line = TT.nextline;
 258   long len = TT.nextlen;
 259   struct sedcmd *command;
 260   int eol = 0, tea = 0;
 261
 262   // Grab next line for deferred processing (EOF detection: we get a NULL
 263   // pline at EOF to flush last line). Note that only end of _last_ input
 264   // file matches $ (unless we're doing -i).
 265   TT.nextline = 0;
 266   TT.nextlen = 0;
 267   if (pline) {
 268     TT.nextline = *pline;
 269     TT.nextlen = plen;
 270     *pline = 0;
 271   }
 272
 273   if (!line || !len) return;
 274   if (line[len-1] == '\n') line[--len] = eol++;
 275   TT.count++;
 276
 277   // The restart-1 is because we added one to make sure it wasn't NULL,
 278   // otherwise N as last command would restart script
 279   command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 280   TT.restart = 0;
 281
 282   while (command) {
 283     char *str, c = command->c;
 284
 285     // Have we got a line or regex matching range for this rule?
 286     if (*command->lmatch || *command->rmatch) {
 287       int miss = 0;
 288       long lm;
 289
 290       // In a match that might end?
 291       if (command->hit) {
 292         if (!(lm = command->lmatch[1])) {
 293           if (!command->rmatch[1]) command->hit = 0;
 294           else {
 295             void *rm = get_regex(command, command->rmatch[1]);
 296
 297             // regex match end includes matching line, so defer deactivation
 298             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 299           }
 300         } else if (lm > 0 && lm < TT.count) command->hit = 0;
 301
 302       // Start a new match?
 303       } else {
 304         if (!(lm = *command->lmatch)) {
 305           void *rm = get_regex(command, *command->rmatch);
 306
 307           if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
 308         } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
 309
 310         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 311       }
 312
 313       // Didn't match?
 314       lm = !(command->hit ^ command->not);
 315
 316       // Deferred disable from regex end match
 317       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 318
 319       if (lm) {
 320         // Handle skipping curly bracket command group
 321         if (c == '{') {
 322           int curly = 1;
 323
 324           while (curly) {
 325             command = command->next;
 326             if (command->c == '{') curly++;
 327             if (command->c == '}') curly--;
 328           }
 329         }
 330         command = command->next;
 331         continue;
 332       }
 333     }
 334
 335     // A deleted line can still update line match state for later commands
 336     if (!line) {
 337       command = command->next;
 338       continue;
 339     }
 340
 341     // Process command
 342
 343     if (c=='a' || c=='r') {
 344       struct append *a = xzalloc(sizeof(struct append));
 345       if (command->arg1) a->str = command->arg1+(char *)command;
 346       a->file = c=='r';
 347       dlist_add_nomalloc((void *)&append, (void *)a);
 348     } else if (c=='b' || c=='t' || c=='T') {
 349       int t = tea;
 350
 351       if (c != 'b') tea = 0;
 352       if (c=='b' || t^(c=='T')) {
 353         if (!command->arg1) break;
 354         str = command->arg1+(char *)command;
 355         for (command = (void *)TT.pattern; command; command = command->next)
 356           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 357             break;
 358         if (!command) error_exit("no :%s", str);
 359       }
 360     } else if (c=='c') {
 361       str = command->arg1+(char *)command;
 362       if (!command->hit) emit(str, strlen(str), 1);
 363       free(line);
 364       line = 0;
 365       continue;
 366     } else if (c=='d') {
 367       free(line);
 368       line = 0;
 369       continue;
 370     } else if (c=='D') {
 371       // Delete up to \n or end of buffer
 372       str = line;
 373       while ((str-line)<len) if (*(str++) == '\n') break;
 374       len -= str - line;
 375       memmove(line, str, len);
 376
 377       // if "delete" blanks line, disable further processing
 378       // otherwise trim and restart script
 379       if (!len) {
 380         free(line);
 381         line = 0;
 382       } else {
 383         line[len] = 0;
 384         command = (void *)TT.pattern;
 385       }
 386       continue;
 387     } else if (c=='g') {
 388       free(line);
 389       line = xstrdup(TT.remember);
 390       len = TT.rememberlen;
 391     } else if (c=='G') {
 392       line = xrealloc(line, len+TT.rememberlen+2);
 393       line[len++] = '\n';
 394       memcpy(line+len, TT.remember, TT.rememberlen);
 395       line[len += TT.rememberlen] = 0;
 396     } else if (c=='h') {
 397       free(TT.remember);
 398       TT.remember = xstrdup(line);
 399       TT.rememberlen = len;
 400     } else if (c=='H') {
 401       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 402       TT.remember[TT.rememberlen++] = '\n';
 403       memcpy(TT.remember+TT.rememberlen, line, len);
 404       TT.remember[TT.rememberlen += len] = 0;
 405     } else if (c=='i') {
 406       str = command->arg1+(char *)command;
 407       emit(str, strlen(str), 1);
 408     } else if (c=='l') {
 409       int i, x, off;
 410
 411       if (!TT.xx) {
 412         terminal_size(&TT.xx, 0);
 413         if (!TT.xx) TT.xx = 80;
 414         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 415         if (TT.xx > 4) TT.xx -= 4;
 416       }
 417
 418       for (i = off = 0; i<len; i++) {
 419         if (off >= TT.xx) {
 420           toybuf[off++] = '\\';
 421           emit(toybuf, off, 1);
 422           off = 0;
 423         }
 424         x = stridx("\\\a\b\f\r\t\v", line[i]);
 425         if (x != -1) {
 426           toybuf[off++] = '\\';
 427           toybuf[off++] = "\\abfrtv"[x];
 428         } else if (line[i] >= ' ') toybuf[off++] = line[i];
 429         else off += sprintf(toybuf+off, "\\%03o", line[i]);
 430       }
 431       toybuf[off++] = '$';
 432       emit(toybuf, off, 1);
 433     } else if (c=='n') {
 434       TT.restart = command->next+1;
 435
 436       break;
 437     } else if (c=='N') {
 438       // Can't just grab next line because we could have multiple N and
 439       // we need to actually read ahead to get N;$p EOF detection right.
 440       if (pline) {
 441         TT.restart = command->next+1;
 442         extend_string(&line, TT.nextline, len, -TT.nextlen);
 443         free(TT.nextline);
 444         TT.nextline = line;
 445         TT.nextlen += len + 1;
 446         line = 0;
 447       }
 448
 449       // Pending append goes out right after N
 450       goto done;
 451     } else if (c=='p' || c=='P') {
 452       char *l = (c=='P') ? strchr(line, '\n') : 0;
 453
 454       if (emit(line, l ? l-line : len, eol)) break;
 455     } else if (c=='q') {
 456       if (pline) *pline = (void *)1;
 457       free(TT.nextline);
 458       TT.nextline = 0;
 459       TT.nextlen = 0;
 460
 461       break;
 462     } else if (c=='s') {
 463       char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
 464       regmatch_t *match = (void *)toybuf;
 465       regex_t *reg = get_regex(command, command->arg1);
 466       int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
 467
 468       // Find match in remaining line (up to remaining len)
 469       while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
 470         mflags = REG_NOTBOL;
 471
 472         // Zero length matches don't count immediately after a previous match
 473         mlen = match[0].rm_eo-match[0].rm_so;
 474         if (!mlen && !zmatch) {
 475           if (!rlen--) break;
 476           rline++;
 477           zmatch++;
 478           continue;
 479         } else zmatch = 0;
 480
 481         // If we're replacing only a specific match, skip if this isn't it
 482         off = command->sflags>>3;
 483         if (off && off != ++count) {
 484           rline += match[0].rm_eo;
 485           rlen -= match[0].rm_eo;
 486
 487           continue;
 488         }
 489         // The fact getline() can allocate unbounded amounts of memory is
 490         // a bigger issue, but while we're here check for integer overflow
 491         if (match[0].rm_eo > INT_MAX) perror_exit(0);
 492
 493         // newlen = strlen(new) but with \1 and & and printf escapes
 494         for (off = newlen = 0; new[off]; off++) {
 495           int cc = -1;
 496
 497           if (new[off] == '&') cc = 0;
 498           else if (new[off] == '\\') cc = new[++off] - '0';
 499           if (cc < 0 || cc > 9) {
 500             newlen++;
 501             continue;
 502           }
 503           newlen += match[cc].rm_eo-match[cc].rm_so;
 504         }
 505
 506         // Allocate new size, copy start/end around match. (Can't extend in
 507         // place because backrefs may refer to text after it's overwritten.)
 508         len += newlen-mlen;
 509         swap = xmalloc(len+1);
 510         rswap = swap+(rline-line)+match[0].rm_so;
 511         memcpy(swap, line, (rline-line)+match[0].rm_so);
 512         memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
 513
 514         // copy in new replacement text
 515         for (off = mlen = 0; new[off]; off++) {
 516           int cc = 0, ll;
 517
 518           if (new[off] == '\\') {
 519             cc = new[++off] - '0';
 520             if (cc<0 || cc>9) {
 521               if (!(rswap[mlen++] = unescape(new[off])))
 522                 rswap[mlen-1] = new[off];
 523
 524               continue;
 525             } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
 526           } else if (new[off] != '&') {
 527             rswap[mlen++] = new[off];
 528
 529             continue;
 530           }
 531
 532           ll = match[cc].rm_eo-match[cc].rm_so;
 533           memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
 534           mlen += ll;
 535         }
 536
 537         rline = rswap+newlen;
 538         free(line);
 539         line = swap;
 540
 541         // Stop after first substitution unless we have flag g
 542         if (!(command->sflags & 2)) break;
 543       }
 544
 545       if (mflags) {
 546         // flag p
 547         if (command->sflags & 4) emit(line, len, eol);
 548
 549         tea = 1;
 550         if (command->w) goto writenow;
 551       }
 552     } else if (c=='w') {
 553       int fd, noeol;
 554       char *name;
 555
 556 writenow:
 557       // Swap out emit() context
 558       fd = TT.fdout;
 559       noeol = TT.noeol;
 560
 561       // We save filehandle and newline status before filename
 562       name = command->w + (char *)command;
 563       memcpy(&TT.fdout, name, 4);
 564       name += 4;
 565       TT.noeol = *(name++);
 566
 567       // write, then save/restore context
 568       if (emit(line, len, eol))
 569         perror_exit("w '%s'", command->arg1+(char *)command);
 570       *(--name) = TT.noeol;
 571       TT.noeol = noeol;
 572       TT.fdout = fd;
 573     } else if (c=='x') {
 574       long swap = TT.rememberlen;
 575
 576       str = TT.remember;
 577       TT.remember = line;
 578       line = str;
 579       TT.rememberlen = len;
 580       len = swap;
 581     } else if (c=='y') {
 582       char *from, *to = (char *)command;
 583       int i, j;
 584
 585       from = to+command->arg1;
 586       to += command->arg2;
 587
 588       for (i = 0; i < len; i++) {
 589         j = stridx(from, line[i]);
 590         if (j != -1) line[i] = to[j];
 591       }
 592     } else if (c=='=') {
 593       sprintf(toybuf, "%ld", TT.count);
 594       emit(toybuf, strlen(toybuf), 1);
 595     }
 596
 597     command = command->next;
 598   }
 599
 600   if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
 601
 602 done:
 603   if (dlist_terminate(append)) while (append) {
 604     struct append *a = append->next;
 605
 606     if (append->file) {
 607       int fd = open(append->str, O_RDONLY);
 608
 609       // Force newline if noeol pending
 610       if (fd != -1) {
 611         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 612         TT.noeol = 0;
 613         xsendfile(fd, TT.fdout);
 614         close(fd);
 615       }
 616     } else if (append->str) emit(append->str, strlen(append->str), 1);
 617     else emit(line, 0, 0);
 618     free(append);
 619     append = a;
 620   }
 621   free(line);
 622 }
 623
 624 // Callback called on each input file
 625 static void do_sed(int fd, char *name)
 626 {
 627   int i = toys.optflags & FLAG_i;
 628   char *tmp;
 629
 630   if (i) {
 631     struct sedcmd *command;
 632
 633     if (!fd && !strcmp(name, "-")) {
 634       error_msg("-i on stdin");
 635       return;
 636     }
 637     TT.fdout = copy_tempfile(fd, name, &tmp);
 638     TT.count = 0;
 639     for (command = (void *)TT.pattern; command; command = command->next)
 640       command->hit = 0;
 641   }
 642   do_lines(fd, process_line);
 643   if (i) {
 644     process_line(0, 0);
 645     replace_tempfile(-1, TT.fdout, &tmp);
 646     TT.fdout = 1;
 647     TT.nextline = 0;
 648     TT.nextlen = TT.noeol = 0;
 649   }
 650 }
 651
 652 // Copy chunk of string between two delimiters, converting printf escapes.
 653 // returns processed copy of string (0 if error), *pstr advances to next
 654 // unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 655 // if regxex, ignore delimiter in [ranges]
 656 static char *unescape_delimited_string(char **pstr, char *delim)
 657 {
 658   char *to, *from, mode = 0, d;
 659
 660   from = *pstr;
 661   if (!delim || !*delim) {
 662     if (!(d = *(from++))) return 0;
 663     if (d == '\\') d = *(from++);
 664     if (!d || d == '\\') return 0;
 665     if (delim) *delim = d;
 666   } else d = *delim;
 667   to = delim = xmalloc(strlen(*pstr)+1);
 668
 669   while (mode || *from != d) {
 670     if (!*from) return 0;
 671
 672     // delimiter in regex character range doesn't count
 673     if (!mode && *from == '[') {
 674       mode = '[';
 675       if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 676     } else if (mode && *from == ']') mode = 0;
 677     // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 678     // but the perl build does it, so we need to filter it out.
 679     else if (mode && *from == '-' && from[-1] == from[1]) {
 680       from+=2;
 681       continue;
 682     } else if (*from == '\\') {
 683       if (!from[1]) return 0;
 684
 685       // Check escaped end delimiter before printf style escapes.
 686       if (from[1] == d) from++;
 687       else if (from[1]=='\\') *(to++) = *(from++);
 688       else {
 689         char c = unescape(from[1]);
 690
 691         if (c) {
 692           *(to++) = c;
 693           from+=2;
 694           continue;
 695         } else if (!mode) *(to++) = *(from++);
 696       }
 697     }
 698     *(to++) = *(from++);
 699   }
 700   *to = 0;
 701   *pstr = from+1;
 702
 703   return delim;
 704 }
 705
 706 // Translate pattern strings into command structures. Each command structure
 707 // is a single allocation (which requires some math and remalloc at times).
 708 static void parse_pattern(char **pline, long len)
 709 {
 710   struct sedcmd *command = (void *)TT.pattern;
 711   char *line, *reg, c, *errstart;
 712   int i;
 713
 714   line = errstart = pline ? *pline : "";
 715   if (len && line[len-1]=='\n') line[--len] = 0;
 716
 717   // Append this line to previous multiline command? (hit indicates type.)
 718   // During parsing "hit" stores data about line continuations, but in
 719   // process_line() it means the match range attached to this command
 720   // is active, so processing the continuation must zero it again.
 721   if (command && command->prev->hit) {
 722     // Remove half-finished entry from list so remalloc() doesn't confuse it
 723     TT.pattern = TT.pattern->prev;
 724     command = dlist_pop(&TT.pattern);
 725     c = command->c;
 726     reg = (char *)command;
 727     reg += command->arg1 + strlen(reg + command->arg1);
 728
 729     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 730     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 731     // a unicode character.
 732     if (command->hit < 256) goto resume_s;
 733     else goto resume_a;
 734   }
 735
 736   // Loop through commands in this line.
 737
 738   command = 0;
 739   for (;;) {
 740     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 741
 742     // If there's no more data on this line, return.
 743     for (;;) {
 744       while (isspace(*line) || *line == ';') line++;
 745       if (*line == '#') while (*line && *line != '\n') line++;
 746       else break;
 747     }
 748     if (!*line) return;
 749
  750     // We start by writing data into toybuf. Later we'll allocate the
  751     // exact size needed and copy it there (see xmemdup() below).
 752
 753     errstart = line;
 754     memset(toybuf, 0, sizeof(struct sedcmd));
 755     command = (void *)toybuf;
 756     reg = toybuf + sizeof(struct sedcmd);
 757
 758     // Parse address range (if any)
 759     for (i = 0; i < 2; i++) {
 760       if (*line == ',') line++;
 761       else if (i) break;
 762
 763       if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 764       else if (*line == '$') {
 765         command->lmatch[i] = -1;
 766         line++;
 767       } else if (*line == '/' || *line == '\\') {
 768         char *s = line;
 769
 770         if (!(s = unescape_delimited_string(&line, 0))) goto error;
 771         if (!*s) command->rmatch[i] = 0;
 772         else {
 773           xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
 774           command->rmatch[i] = reg-toybuf;
 775           reg += sizeof(regex_t);
 776         }
 777         free(s);
 778       } else break;
 779     }
 780
 781     while (isspace(*line)) line++;
 782     if (!*line) break;
 783
 784     while (*line == '!') {
 785       command->not = 1;
 786       line++;
 787     }
 788     while (isspace(*line)) line++;
 789
 790     c = command->c = *(line++);
 791     if (strchr("}:", c) && i) break;
 792     if (strchr("aiqr=", c) && i>1) break;
 793
 794     // Add step to pattern
 795     command = xmemdup(toybuf, reg-toybuf);
 796     reg = (reg-toybuf) + (char *)command;
 797
 798     // Parse arguments by command type
 799     if (c == '{') TT.nextlen++;
 800     else if (c == '}') {
 801       if (!TT.nextlen--) break;
 802     } else if (c == 's') {
 803       char *end, delim = 0;
 804
 805       // s/pattern/replacement/flags
 806
 807       // line continuations use arg1 (back at the start of the function),
 808       // so let's fill out arg2 first (since the regex part can't be multiple
 809       // lines) and swap them back later.
 810
 811       // get pattern (just record, we parse it later)
 812       command->arg2 = reg - (char *)command;
 813       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 814         goto error;
 815
 816       reg += sizeof(regex_t);
 817       command->arg1 = reg-(char *)command;
 818       command->hit = delim;
 819 resume_s:
 820       // get replacement - don't replace escapes yet because \1 and \& need
 821       // processing later, after we replace \\ with \ we can't tell \\1 from \1
 822       end = line;
 823       while (*end != command->hit) {
 824         if (!*end) goto error;
 825         if (*end++ == '\\') {
 826           if (!*end || *end == '\n') {
 827             end[-1] = '\n';
 828             break;
 829           }
 830           end++;
 831         }
 832       }
 833
 834       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 835       line = end;
 836       // line continuation? (note: '\n' can't be a valid delim).
 837       if (*line == command->hit) command->hit = 0;
 838       else {
 839         if (!*line) continue;
 840         reg--;
 841         line++;
 842         goto resume_s;
 843       }
 844
 845       // swap arg1/arg2 so they're back in order arguments occur.
 846       i = command->arg1;
 847       command->arg1 = command->arg2;
 848       command->arg2 = i;
 849
 850       // get flags
 851       for (line++; *line; line++) {
 852         long l;
 853
 854         if (isspace(*line) && *line != '\n') continue;
 855
 856         if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 857         else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 858           command->sflags |= l << 3;
 859           line--;
 860         } else break;
 861       }
 862
 863       // We deferred actually parsing the regex until we had the s///i flag
 864       // allocating the space was done by extend_string() above
 865       if (!*TT.remember) command->arg1 = 0;
 866       else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 867         ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
 868       free(TT.remember);
 869       TT.remember = 0;
 870       if (*line == 'w') {
 871         line++;
 872         goto writenow;
 873       }
 874     } else if (c == 'w') {
 875       int fd, delim;
 876       char *cc;
 877
 878       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 879       // eol status, and to retain the filename for error messages, we'd need
 880       // to go up to arg5 just for this. Compromise: dynamically allocate the
 881       // filehandle and eol status.
 882
 883 writenow:
 884       while (isspace(*line)) line++;
 885       if (!*line) goto error;
 886       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 887       delim = *cc;
 888       *cc = 0;
 889       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 890       *cc = delim;
 891
 892       command->w = reg - (char *)command;
 893       command = xrealloc(command, command->w+(cc-line)+6);
 894       reg = command->w + (char *)command;
 895
 896       memcpy(reg, &fd, 4);
 897       reg += 4;
 898       *(reg++) = 0;
 899       memcpy(reg, line, delim);
 900       reg += delim;
 901       *(reg++) = 0;
 902
 903       line = cc;
 904       if (delim) line += 2;
 905     } else if (c == 'y') {
 906       char *s, delim = 0;
 907       int len;
 908
 909       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 910       command->arg1 = reg-(char *)command;
 911       len = strlen(s);
 912       reg = extend_string((void *)&command, s, reg-(char *)command, len);
 913       free(s);
 914       command->arg2 = reg-(char *)command;
 915       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 916       if (len != strlen(s)) goto error;
 917       reg = extend_string((void *)&command, s, reg-(char*)command, len);
 918       free(s);
 919     } else if (strchr("abcirtTw:", c)) {
 920       int end;
 921
 922       // trim leading spaces
 923       while (isspace(*line) && *line != '\n') line++;
 924
 925       // Resume logic differs from 's' case because we don't add a newline
 926       // unless it's after something, so we add it on return instead.
 927 resume_a:
 928       command->hit = 0;
 929
 930       // btT: end with space or semicolon, aicrw continue to newline.
 931       if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
 932         // Argument's optional for btT
 933         if (strchr("btT", c)) continue;
 934         else if (!command->arg1) break;
 935       }
 936
 937       // Extend allocation to include new string. We use offsets instead of
 938       // pointers so realloc() moving stuff doesn't break things. Ok to write
 939       // \n over NUL terminator because call to extend_string() adds it back.
 940       if (!command->arg1) command->arg1 = reg - (char*)command;
 941       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 942       else if (!pline) {
 943         command->arg1 = 0;
 944         continue;
 945       }
 946       reg = extend_string((void *)&command, line, reg - (char *)command, end);
 947
 948       // Recopy data to remove escape sequences and handle line continuation.
 949       if (strchr("aci", c)) {
 950         reg -= end+1;
 951         for (i = end; i; i--) {
 952           if ((*reg++ = *line++)=='\\') {
 953
 954             // escape at end of line: resume if -e escaped literal newline,
 955             // else request callback and resume with next line
 956             if (!--i) {
 957               *--reg = 0;
 958               if (*line) {
 959                 line++;
 960                 goto resume_a;
 961               }
 962               command->hit = 256;
 963               break;
 964             }
 965             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 966             line++;
 967           }
 968         }
 969         *reg = 0;
 970       } else line += end;
 971
 972     // Commands that take no arguments
 973     } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
 974   }
 975
 976 error:
 977   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 978 }
 979
 980 void sed_main(void)
 981 {
 982   struct arg_list *al;
 983   char **args = toys.optargs;
 984
 985   // Lie to autoconf when it asks stupid questions, so configure regexes
 986   // that look for "GNU sed version %f" greater than some old buggy number
 987   // don't fail us for not matching their narrow expectations.
 988   if (toys.optflags & FLAG_version) {
 989     xprintf("This is not GNU sed version 9.0\n");
 990     return;
 991   }
 992
 993   // Parse pattern into commands.
 994
 995   // If no -e or -f, first argument is the pattern.
 996   if (!TT.e && !TT.f) {
 997     if (!*toys.optargs) error_exit("no pattern");
 998     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
 999   }
1000
1001   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1002   // so handle all -e, then all -f. (At least the behavior's consistent.)
1003
1004   for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1005   for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
1006   parse_pattern(0, 0);
1007   dlist_terminate(TT.pattern);
1008   if (TT.nextlen) error_exit("no }");
1009
1010   TT.fdout = 1;
1011   TT.remember = xstrdup("");
1012
1013   // Inflict pattern upon input files
1014   loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
1015
1016   if (!(toys.optflags & FLAG_i)) process_line(0, 0);
1017
1018   // todo: need to close fd when done for TOYBOX_FREE?
1019 }