1030 lines
32 KiB
C
1030 lines
32 KiB
C
/* sed.c - stream editor. Thing that does s/// and other stuff.
|
|
*
|
|
* Copyright 2014 Rob Landley <rob@landley.net>
|
|
*
|
|
* See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
|
|
*
|
|
* TODO: lines > 2G could wrap signed int length counters. Not just getline()
|
|
* but N and s///
|
|
* TODO: make y// handle unicode
|
|
* TODO: handle error return from emit(), error_msg/exit consistently
|
|
* What's the right thing to do for -i when write fails? Skip to next?
|
|
|
|
USE_SED(NEWTOY(sed, "(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
|
|
|
|
config SED
|
|
bool "sed"
|
|
default y
|
|
help
|
|
usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
|
|
|
|
Stream editor. Apply one or more editing SCRIPTs to each line of input
|
|
(from FILE or stdin) producing output (by default to stdout).
|
|
|
|
-e add SCRIPT to list
|
|
-f add contents of SCRIPT_FILE to list
|
|
-i Edit each file in place.
|
|
-n No default output. (Use the p command to output matched lines.)
|
|
-r Use extended regular expression syntax.
|
|
-E Alias for -r.
|
|
-s Treat input files separately (implied by -i)
|
|
|
|
A SCRIPT is a series of one or more COMMANDs separated by newlines or
|
|
semicolons. All -e SCRIPTs are concatenated together as if separated
|
|
by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
|
|
If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
|
|
|
|
Each COMMAND may be preceded by an address which limits the command to
|
|
apply only to the specified line(s). Commands without an address apply to
|
|
every line. Addresses are of the form:
|
|
|
|
[ADDRESS[,ADDRESS]]COMMAND
|
|
|
|
The ADDRESS may be a decimal line number (starting at 1), a /regular
|
|
expression/ within a pair of forward slashes, or the character "$" which
|
|
matches the last line of input. (In -s or -i mode this matches the last
|
|
line of each file, otherwise just the last line of the last file.) A single
|
|
address matches one line, a pair of comma separated addresses match
|
|
everything from the first address to the second address (inclusive). If
|
|
both addresses are regular expressions, more than one range of lines in
|
|
each file can match.
|
|
|
|
REGULAR EXPRESSIONS in sed are started and ended by the same character
|
|
(traditionally / but anything except a backslash or a newline works).
|
|
Backslashes may be used to escape the delimiter if it occurs in the
|
|
regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
|
|
and unicode). An empty regex repeats the previous one. ADDRESS regexes
|
|
(above) require the first delimeter to be escaped with a backslash when
|
|
it isn't a forward slash (to distinguish it from the COMMANDs below).
|
|
|
|
Sed mostly operates on individual lines one at a time. It reads each line,
|
|
processes it, and either writes it to the output or discards it before
|
|
reading the next line. Sed can remember one additional line in a separate
|
|
buffer (using the h, H, g, G, and x commands), and can read the next line
|
|
of input early (using the n and N command), but other than that command
|
|
scripts operate on individual lines of text.
|
|
|
|
Each COMMAND starts with a single character. The following commands take
|
|
no arguments:
|
|
|
|
{ Start a new command block, continuing until a corresponding "}".
|
|
Command blocks may nest. If the block has an address, commands within
|
|
the block are only run for lines within the block's address range.
|
|
|
|
} End command block (this command cannot have an address)
|
|
|
|
d Delete this line and move on to the next one
|
|
(ignores remaining COMMANDs)
|
|
|
|
D Delete one line of input and restart command SCRIPT (same as "d"
|
|
unless you've glued lines together with "N" or similar)
|
|
|
|
g Get remembered line (overwriting current line)
|
|
|
|
G Get remembered line (appending to current line)
|
|
|
|
h Remember this line (overwriting remembered line)
|
|
|
|
H Remember this line (appending to remembered line, if any)
|
|
|
|
l Print line, escaping \abfrtv (but not newline), octal escaping other
|
|
nonprintable characters, wrapping lines to terminal width with a
|
|
backslash, and appending $ to actual end of line.
|
|
|
|
n Print default output and read next line, replacing current line
|
|
(If no next line available, quit processing script)
|
|
|
|
N Append next line of input to this line, separated by a newline
|
|
(This advances the line counter for address matching and "=", if no
|
|
next line available quit processing script without default output)
|
|
|
|
p Print this line
|
|
|
|
P Print this line up to first newline (from "N")
|
|
|
|
q Quit (print default output, no more commands processed or lines read)
|
|
|
|
x Exchange this line with remembered line (overwrite in both directions)
|
|
|
|
= Print the current line number (followed by a newline)
|
|
|
|
The following commands (may) take an argument. The "text" arguments (to
|
|
the "a", "b", and "c" commands) may end with an unescaped "\" to append
|
|
the next line (for which leading whitespace is not skipped), and also
|
|
treat ";" as a literal character (use "\;" instead).
|
|
|
|
a [text] Append text to output before attempting to read next line
|
|
|
|
b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
|
|
|
|
c [text] Delete line, output text at end of matching address range
|
|
(ignores remaining COMMANDs)
|
|
|
|
i [text] Print text
|
|
|
|
r [file] Append contents of file to output before attempting to read
|
|
next line.
|
|
|
|
s/S/R/F Search for regex S, replace matched text with R using flags F.
|
|
The first character after the "s" (anything but newline or
|
|
backslash) is the delimiter, escape with \ to use normally.
|
|
|
|
The replacement text may contain "&" to substitute the matched
|
|
text (escape it with backslash for a literal &), or \1 through
|
|
\9 to substitute a parenthetical subexpression in the regex.
|
|
You can also use the normal backslash escapes such as \n and
|
|
a backslash at the end of the line appends the next line.
|
|
|
|
The flags are:
|
|
|
|
[0-9] A number, substitute only that occurrence of pattern
|
|
g Global, substitute all occurrences of pattern
|
|
i Ignore case when matching
|
|
p Print the line if match was found and replaced
|
|
w [file] Write (append) line to file if match replaced
|
|
|
|
t [label] Test, jump to :label only if an "s" command found a match in
|
|
this line since last test (replacing with same text counts)
|
|
|
|
T [label] Test false, jump only if "s" hasn't found a match.
|
|
|
|
w [file] Write (append) line to file
|
|
|
|
y/old/new/ Change each character in 'old' to corresponding character
|
|
in 'new' (with standard backslash escapes, delimiter can be
|
|
any repeated character except \ or \n)
|
|
|
|
: [label] Labeled target for jump commands
|
|
|
|
# Comment, ignore rest of this line of SCRIPT
|
|
|
|
Deviations from posix: allow extended regular expressions with -r,
|
|
editing in place with -i, separate with -s, printf escapes in text, line
|
|
continuations, semicolons after all commands, 2-address anywhere an
|
|
address is allowed, "T" command, multiline continuations for [abc],
|
|
\; to end [abc] argument before end of line.
|
|
*/
|
|
|
|
#define FOR_sed
|
|
#include "toys.h"
|
|
|
|
GLOBALS(
|
|
struct arg_list *f;
|
|
struct arg_list *e;
|
|
|
|
// processed pattern list
|
|
struct double_list *pattern;
|
|
|
|
char *nextline, *remember;
|
|
void *restart, *lastregex;
|
|
long nextlen, rememberlen, count;
|
|
int fdout, noeol;
|
|
unsigned xx;
|
|
)
|
|
|
|
// Linked list of parsed sed commands. Offset fields indicate location where
|
|
// regex or string starts, ala offset+(char *)struct, because we remalloc()
|
|
// these to expand them for multiline inputs, and pointers would have to be
|
|
// individually adjusted.
|
|
|
|
struct sedcmd {
|
|
struct sedcmd *next, *prev;
|
|
|
|
// Begin and end of each match
|
|
long lmatch[2]; // line number of match
|
|
int rmatch[2]; // offset of regex struct for prefix matches (/abc/,/def/p)
|
|
int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
|
|
unsigned not, hit;
|
|
unsigned sflags; // s///flag bits: i=1, g=2, p=4
|
|
char c; // action
|
|
};
|
|
|
|
// Write out line with potential embedded NUL, handling eol/noeol
|
|
static int emit(char *line, long len, int eol)
|
|
{
|
|
int l, old = line[len];
|
|
|
|
if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
|
|
TT.noeol = !eol;
|
|
if (eol) line[len++] = '\n';
|
|
if (!len) return 0;
|
|
l = writeall(TT.fdout, line, len);
|
|
if (eol) line[len-1] = old;
|
|
if (l != len) {
|
|
perror_msg("short write");
|
|
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
// Extend allocation to include new string, with newline between if newlen<0
|
|
|
|
static char *extend_string(char **old, char *new, int oldlen, int newlen)
|
|
{
|
|
int newline = newlen < 0;
|
|
char *s;
|
|
|
|
if (newline) newlen = -newlen;
|
|
s = *old = xrealloc(*old, oldlen+newlen+newline+1);
|
|
if (newline) s[oldlen++] = '\n';
|
|
memcpy(s+oldlen, new, newlen);
|
|
s[oldlen+newlen] = 0;
|
|
|
|
return s+oldlen+newlen+1;
|
|
}
|
|
|
|
// An empty regex repeats the previous one
|
|
static void *get_regex(void *trump, int offset)
|
|
{
|
|
if (!offset) {
|
|
if (!TT.lastregex) error_exit("no previous regex");
|
|
return TT.lastregex;
|
|
}
|
|
|
|
return TT.lastregex = offset+(char *)trump;
|
|
}
|
|
|
|
// Apply pattern to line from input file
|
|
static void process_line(char **pline, long plen)
|
|
{
|
|
struct append {
|
|
struct append *next, *prev;
|
|
int file;
|
|
char *str;
|
|
} *append = 0;
|
|
char *line = TT.nextline;
|
|
long len = TT.nextlen;
|
|
struct sedcmd *command;
|
|
int eol = 0, tea = 0;
|
|
|
|
// Grab next line for deferred processing (EOF detection: we get a NULL
|
|
// pline at EOF to flush last line). Note that only end of _last_ input
|
|
// file matches $ (unless we're doing -i).
|
|
TT.nextline = 0;
|
|
TT.nextlen = 0;
|
|
if (pline) {
|
|
TT.nextline = *pline;
|
|
TT.nextlen = plen;
|
|
*pline = 0;
|
|
}
|
|
|
|
if (!line || !len) return;
|
|
if (line[len-1] == '\n') line[--len] = eol++;
|
|
TT.count++;
|
|
|
|
// The restart-1 is because we added one to make sure it wasn't NULL,
|
|
// otherwise N as last command would restart script
|
|
command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
|
|
TT.restart = 0;
|
|
|
|
while (command) {
|
|
char *str, c = command->c;
|
|
|
|
// Have we got a line or regex matching range for this rule?
|
|
if (*command->lmatch || *command->rmatch) {
|
|
int miss = 0;
|
|
long lm;
|
|
|
|
// In a match that might end?
|
|
if (command->hit) {
|
|
if (!(lm = command->lmatch[1])) {
|
|
if (!command->rmatch[1]) command->hit = 0;
|
|
else {
|
|
void *rm = get_regex(command, command->rmatch[1]);
|
|
|
|
// regex match end includes matching line, so defer deactivation
|
|
if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
|
|
}
|
|
} else if (lm > 0 && lm < TT.count) command->hit = 0;
|
|
|
|
// Start a new match?
|
|
} else {
|
|
if (!(lm = *command->lmatch)) {
|
|
void *rm = get_regex(command, *command->rmatch);
|
|
|
|
if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
|
|
} else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
|
|
|
|
if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
|
|
}
|
|
|
|
// Didn't match?
|
|
lm = !(command->hit ^ command->not);
|
|
|
|
// Deferred disable from regex end match
|
|
if (miss || command->lmatch[1] == TT.count) command->hit = 0;
|
|
|
|
if (lm) {
|
|
// Handle skipping curly bracket command group
|
|
if (c == '{') {
|
|
int curly = 1;
|
|
|
|
while (curly) {
|
|
command = command->next;
|
|
if (command->c == '{') curly++;
|
|
if (command->c == '}') curly--;
|
|
}
|
|
}
|
|
command = command->next;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// A deleted line can still update line match state for later commands
|
|
if (!line) {
|
|
command = command->next;
|
|
continue;
|
|
}
|
|
|
|
// Process command
|
|
|
|
if (c=='a' || c=='r') {
|
|
struct append *a = xzalloc(sizeof(struct append));
|
|
if (command->arg1) a->str = command->arg1+(char *)command;
|
|
a->file = c=='r';
|
|
dlist_add_nomalloc((void *)&append, (void *)a);
|
|
} else if (c=='b' || c=='t' || c=='T') {
|
|
int t = tea;
|
|
|
|
if (c != 'b') tea = 0;
|
|
if (c=='b' || t^(c=='T')) {
|
|
if (!command->arg1) break;
|
|
str = command->arg1+(char *)command;
|
|
for (command = (void *)TT.pattern; command; command = command->next)
|
|
if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
|
|
break;
|
|
if (!command) error_exit("no :%s", str);
|
|
}
|
|
} else if (c=='c') {
|
|
str = command->arg1+(char *)command;
|
|
if (!command->hit) emit(str, strlen(str), 1);
|
|
free(line);
|
|
line = 0;
|
|
continue;
|
|
} else if (c=='d') {
|
|
free(line);
|
|
line = 0;
|
|
continue;
|
|
} else if (c=='D') {
|
|
// Delete up to \n or end of buffer
|
|
str = line;
|
|
while ((str-line)<len) if (*(str++) == '\n') break;
|
|
len -= str - line;
|
|
memmove(line, str, len);
|
|
|
|
// if "delete" blanks line, disable further processing
|
|
// otherwise trim and restart script
|
|
if (!len) {
|
|
free(line);
|
|
line = 0;
|
|
} else {
|
|
line[len] = 0;
|
|
command = (void *)TT.pattern;
|
|
}
|
|
continue;
|
|
} else if (c=='g') {
|
|
free(line);
|
|
line = xstrdup(TT.remember);
|
|
len = TT.rememberlen;
|
|
} else if (c=='G') {
|
|
line = xrealloc(line, len+TT.rememberlen+2);
|
|
line[len++] = '\n';
|
|
memcpy(line+len, TT.remember, TT.rememberlen);
|
|
line[len += TT.rememberlen] = 0;
|
|
} else if (c=='h') {
|
|
free(TT.remember);
|
|
TT.remember = xstrdup(line);
|
|
TT.rememberlen = len;
|
|
} else if (c=='H') {
|
|
TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
|
|
TT.remember[TT.rememberlen++] = '\n';
|
|
memcpy(TT.remember+TT.rememberlen, line, len);
|
|
TT.remember[TT.rememberlen += len] = 0;
|
|
} else if (c=='i') {
|
|
str = command->arg1+(char *)command;
|
|
emit(str, strlen(str), 1);
|
|
} else if (c=='l') {
|
|
int i, x, off;
|
|
|
|
if (!TT.xx) {
|
|
terminal_size(&TT.xx, 0);
|
|
if (!TT.xx) TT.xx = 80;
|
|
if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
|
|
if (TT.xx > 4) TT.xx -= 4;
|
|
}
|
|
|
|
for (i = off = 0; i<len; i++) {
|
|
if (off >= TT.xx) {
|
|
toybuf[off++] = '\\';
|
|
emit(toybuf, off, 1);
|
|
off = 0;
|
|
}
|
|
x = stridx("\\\a\b\f\r\t\v", line[i]);
|
|
if (x != -1) {
|
|
toybuf[off++] = '\\';
|
|
toybuf[off++] = "\\abfrtv"[x];
|
|
} else if (line[i] >= ' ') toybuf[off++] = line[i];
|
|
else off += sprintf(toybuf+off, "\\%03o", line[i]);
|
|
}
|
|
toybuf[off++] = '$';
|
|
emit(toybuf, off, 1);
|
|
} else if (c=='n') {
|
|
TT.restart = command->next+1;
|
|
|
|
break;
|
|
} else if (c=='N') {
|
|
// Can't just grab next line because we could have multiple N and
|
|
// we need to actually read ahead to get N;$p EOF detection right.
|
|
if (pline) {
|
|
TT.restart = command->next+1;
|
|
extend_string(&line, TT.nextline, len, -TT.nextlen);
|
|
free(TT.nextline);
|
|
TT.nextline = line;
|
|
TT.nextlen += len + 1;
|
|
line = 0;
|
|
}
|
|
|
|
// Pending append goes out right after N
|
|
goto done;
|
|
} else if (c=='p' || c=='P') {
|
|
char *l = (c=='P') ? strchr(line, '\n') : 0;
|
|
|
|
if (emit(line, l ? l-line : len, eol)) break;
|
|
} else if (c=='q') {
|
|
if (pline) *pline = (void *)1;
|
|
free(TT.nextline);
|
|
TT.nextline = 0;
|
|
TT.nextlen = 0;
|
|
|
|
break;
|
|
} else if (c=='s') {
|
|
char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
|
|
regmatch_t *match = (void *)toybuf;
|
|
regex_t *reg = get_regex(command, command->arg1);
|
|
int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
|
|
|
|
// Find match in remaining line (up to remaining len)
|
|
while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
|
|
mflags = REG_NOTBOL;
|
|
|
|
// Zero length matches don't count immediately after a previous match
|
|
mlen = match[0].rm_eo-match[0].rm_so;
|
|
if (!mlen && !zmatch) {
|
|
if (!rlen--) break;
|
|
rline++;
|
|
zmatch++;
|
|
continue;
|
|
} else zmatch = 0;
|
|
|
|
// If we're replacing only a specific match, skip if this isn't it
|
|
off = command->sflags>>3;
|
|
if (off && off != ++count) {
|
|
rline += match[0].rm_eo;
|
|
rlen -= match[0].rm_eo;
|
|
|
|
continue;
|
|
}
|
|
// The fact getline() can allocate unbounded amounts of memory is
|
|
// a bigger issue, but while we're here check for integer overflow
|
|
if (match[0].rm_eo > INT_MAX) perror_exit(0);
|
|
|
|
// newlen = strlen(new) but with \1 and & and printf escapes
|
|
for (off = newlen = 0; new[off]; off++) {
|
|
int cc = -1;
|
|
|
|
if (new[off] == '&') cc = 0;
|
|
else if (new[off] == '\\') cc = new[++off] - '0';
|
|
if (cc < 0 || cc > 9) {
|
|
newlen++;
|
|
continue;
|
|
}
|
|
newlen += match[cc].rm_eo-match[cc].rm_so;
|
|
}
|
|
|
|
// Allocate new size, copy start/end around match. (Can't extend in
|
|
// place because backrefs may refer to text after it's overwritten.)
|
|
len += newlen-mlen;
|
|
swap = xmalloc(len+1);
|
|
rswap = swap+(rline-line)+match[0].rm_so;
|
|
memcpy(swap, line, (rline-line)+match[0].rm_so);
|
|
memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
|
|
|
|
// copy in new replacement text
|
|
for (off = mlen = 0; new[off]; off++) {
|
|
int cc = 0, ll;
|
|
|
|
if (new[off] == '\\') {
|
|
cc = new[++off] - '0';
|
|
if (cc<0 || cc>9) {
|
|
if (!(rswap[mlen++] = unescape(new[off])))
|
|
rswap[mlen-1] = new[off];
|
|
|
|
continue;
|
|
} else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
|
|
} else if (new[off] != '&') {
|
|
rswap[mlen++] = new[off];
|
|
|
|
continue;
|
|
}
|
|
|
|
ll = match[cc].rm_eo-match[cc].rm_so;
|
|
memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
|
|
mlen += ll;
|
|
}
|
|
|
|
rline = rswap+newlen;
|
|
free(line);
|
|
line = swap;
|
|
|
|
// Stop after first substitution unless we have flag g
|
|
if (!(command->sflags & 2)) break;
|
|
}
|
|
|
|
if (mflags) {
|
|
// flag p
|
|
if (command->sflags & 4) emit(line, len, eol);
|
|
|
|
tea = 1;
|
|
if (command->w) goto writenow;
|
|
}
|
|
} else if (c=='w') {
|
|
int fd, noeol;
|
|
char *name;
|
|
|
|
writenow:
|
|
// Swap out emit() context
|
|
fd = TT.fdout;
|
|
noeol = TT.noeol;
|
|
|
|
// We save filehandle and newline status before filename
|
|
name = command->w + (char *)command;
|
|
memcpy(&TT.fdout, name, 4);
|
|
name += 4;
|
|
TT.noeol = *(name++);
|
|
|
|
// write, then save/restore context
|
|
if (emit(line, len, eol))
|
|
perror_exit("w '%s'", command->arg1+(char *)command);
|
|
*(--name) = TT.noeol;
|
|
TT.noeol = noeol;
|
|
TT.fdout = fd;
|
|
} else if (c=='x') {
|
|
long swap = TT.rememberlen;
|
|
|
|
str = TT.remember;
|
|
TT.remember = line;
|
|
line = str;
|
|
TT.rememberlen = len;
|
|
len = swap;
|
|
} else if (c=='y') {
|
|
char *from, *to = (char *)command;
|
|
int i, j;
|
|
|
|
from = to+command->arg1;
|
|
to += command->arg2;
|
|
|
|
for (i = 0; i < len; i++) {
|
|
j = stridx(from, line[i]);
|
|
if (j != -1) line[i] = to[j];
|
|
}
|
|
} else if (c=='=') {
|
|
sprintf(toybuf, "%ld", TT.count);
|
|
emit(toybuf, strlen(toybuf), 1);
|
|
}
|
|
|
|
command = command->next;
|
|
}
|
|
|
|
if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
|
|
|
|
done:
|
|
if (dlist_terminate(append)) while (append) {
|
|
struct append *a = append->next;
|
|
|
|
if (append->file) {
|
|
int fd = open(append->str, O_RDONLY);
|
|
|
|
// Force newline if noeol pending
|
|
if (fd != -1) {
|
|
if (TT.noeol) xwrite(TT.fdout, "\n", 1);
|
|
TT.noeol = 0;
|
|
xsendfile(fd, TT.fdout);
|
|
close(fd);
|
|
}
|
|
} else if (append->str) emit(append->str, strlen(append->str), 1);
|
|
else emit(line, 0, 0);
|
|
free(append);
|
|
append = a;
|
|
}
|
|
free(line);
|
|
}
|
|
|
|
// Callback called on each input file
|
|
static void do_sed(int fd, char *name)
|
|
{
|
|
int i = toys.optflags & FLAG_i;
|
|
char *tmp;
|
|
|
|
if (i) {
|
|
struct sedcmd *command;
|
|
|
|
if (!fd && !strcmp(name, "-")) {
|
|
error_msg("-i on stdin");
|
|
return;
|
|
}
|
|
TT.fdout = copy_tempfile(fd, name, &tmp);
|
|
TT.count = 0;
|
|
for (command = (void *)TT.pattern; command; command = command->next)
|
|
command->hit = 0;
|
|
}
|
|
do_lines(fd, process_line);
|
|
if (i) {
|
|
process_line(0, 0);
|
|
replace_tempfile(-1, TT.fdout, &tmp);
|
|
TT.fdout = 1;
|
|
TT.nextline = 0;
|
|
TT.nextlen = TT.noeol = 0;
|
|
}
|
|
}
|
|
|
|
// Copy chunk of string between two delimiters, converting printf escapes.
|
|
// returns processed copy of string (0 if error), *pstr advances to next
|
|
// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
|
|
// if regxex, ignore delimiter in [ranges]
|
|
static char *unescape_delimited_string(char **pstr, char *delim)
|
|
{
|
|
char *to, *from, mode = 0, d;
|
|
|
|
// Grab leading delimiter (if necessary), allocate space for new string
|
|
from = *pstr;
|
|
if (!delim || !*delim) {
|
|
if (!(d = *(from++))) return 0;
|
|
if (d == '\\') d = *(from++);
|
|
if (!d || d == '\\') return 0;
|
|
if (delim) *delim = d;
|
|
} else d = *delim;
|
|
to = delim = xmalloc(strlen(*pstr)+1);
|
|
|
|
while (mode || *from != d) {
|
|
if (!*from) return 0;
|
|
|
|
// delimiter in regex character range doesn't count
|
|
if (*from == '[') {
|
|
if (!mode) {
|
|
mode = ']';
|
|
if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
|
|
} else if (mode == ']' && strchr(".=:", from[1])) {
|
|
*(to++) = *(from++);
|
|
mode = *from;
|
|
}
|
|
} else if (*from == mode) {
|
|
if (mode == ']') mode = 0;
|
|
else {
|
|
*(to++) = *(from++);
|
|
mode = ']';
|
|
}
|
|
// Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
|
|
// but the perl build does it, so we need to filter it out.
|
|
} else if (mode && *from == '-' && from[-1] == from[1]) {
|
|
from+=2;
|
|
continue;
|
|
} else if (*from == '\\') {
|
|
if (!from[1]) return 0;
|
|
|
|
// Check escaped end delimiter before printf style escapes.
|
|
if (from[1] == d) from++;
|
|
else if (from[1]=='\\') *(to++) = *(from++);
|
|
else {
|
|
char c = unescape(from[1]);
|
|
|
|
if (c) {
|
|
*(to++) = c;
|
|
from+=2;
|
|
continue;
|
|
} else if (!mode) *(to++) = *(from++);
|
|
}
|
|
}
|
|
*(to++) = *(from++);
|
|
}
|
|
*to = 0;
|
|
*pstr = from+1;
|
|
|
|
return delim;
|
|
}
|
|
|
|
// Translate pattern strings into command structures. Each command structure
|
|
// is a single allocation (which requires some math and remalloc at times).
|
|
static void parse_pattern(char **pline, long len)
|
|
{
|
|
struct sedcmd *command = (void *)TT.pattern;
|
|
char *line, *reg, c, *errstart;
|
|
int i;
|
|
|
|
line = errstart = pline ? *pline : "";
|
|
if (len && line[len-1]=='\n') line[--len] = 0;
|
|
|
|
// Append this line to previous multiline command? (hit indicates type.)
|
|
// During parsing "hit" stores data about line continuations, but in
|
|
// process_line() it means the match range attached to this command
|
|
// is active, so processing the continuation must zero it again.
|
|
if (command && command->prev->hit) {
|
|
// Remove half-finished entry from list so remalloc() doesn't confuse it
|
|
TT.pattern = TT.pattern->prev;
|
|
command = dlist_pop(&TT.pattern);
|
|
c = command->c;
|
|
reg = (char *)command;
|
|
reg += command->arg1 + strlen(reg + command->arg1);
|
|
|
|
// Resume parsing for 'a' or 's' command. (Only two that can do this.)
|
|
// TODO: using 256 to indicate 'a' means our s/// delimiter can't be
|
|
// a unicode character.
|
|
if (command->hit < 256) goto resume_s;
|
|
else goto resume_a;
|
|
}
|
|
|
|
// Loop through commands in this line.
|
|
|
|
command = 0;
|
|
for (;;) {
|
|
if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
|
|
|
|
// If there's no more data on this line, return.
|
|
for (;;) {
|
|
while (isspace(*line) || *line == ';') line++;
|
|
if (*line == '#') while (*line && *line != '\n') line++;
|
|
else break;
|
|
}
|
|
if (!*line) return;
|
|
|
|
// We start by writing data into toybuf. Later we'll allocate the
|
|
// ex
|
|
|
|
errstart = line;
|
|
memset(toybuf, 0, sizeof(struct sedcmd));
|
|
command = (void *)toybuf;
|
|
reg = toybuf + sizeof(struct sedcmd);
|
|
|
|
// Parse address range (if any)
|
|
for (i = 0; i < 2; i++) {
|
|
if (*line == ',') line++;
|
|
else if (i) break;
|
|
|
|
if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
|
|
else if (*line == '$') {
|
|
command->lmatch[i] = -1;
|
|
line++;
|
|
} else if (*line == '/' || *line == '\\') {
|
|
char *s = line;
|
|
|
|
if (!(s = unescape_delimited_string(&line, 0))) goto error;
|
|
if (!*s) command->rmatch[i] = 0;
|
|
else {
|
|
xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
|
|
command->rmatch[i] = reg-toybuf;
|
|
reg += sizeof(regex_t);
|
|
}
|
|
free(s);
|
|
} else break;
|
|
}
|
|
|
|
while (isspace(*line)) line++;
|
|
if (!*line) break;
|
|
|
|
while (*line == '!') {
|
|
command->not = 1;
|
|
line++;
|
|
}
|
|
while (isspace(*line)) line++;
|
|
|
|
c = command->c = *(line++);
|
|
if (strchr("}:", c) && i) break;
|
|
if (strchr("aiqr=", c) && i>1) break;
|
|
|
|
// Add step to pattern
|
|
command = xmemdup(toybuf, reg-toybuf);
|
|
reg = (reg-toybuf) + (char *)command;
|
|
|
|
// Parse arguments by command type
|
|
if (c == '{') TT.nextlen++;
|
|
else if (c == '}') {
|
|
if (!TT.nextlen--) break;
|
|
} else if (c == 's') {
|
|
char *end, delim = 0;
|
|
|
|
// s/pattern/replacement/flags
|
|
|
|
// line continuations use arg1 (back at the start of the function),
|
|
// so let's fill out arg2 first (since the regex part can't be multiple
|
|
// lines) and swap them back later.
|
|
|
|
// get pattern (just record, we parse it later)
|
|
command->arg2 = reg - (char *)command;
|
|
if (!(TT.remember = unescape_delimited_string(&line, &delim)))
|
|
goto error;
|
|
|
|
reg += sizeof(regex_t);
|
|
command->arg1 = reg-(char *)command;
|
|
command->hit = delim;
|
|
resume_s:
|
|
// get replacement - don't replace escapes yet because \1 and \& need
|
|
// processing later, after we replace \\ with \ we can't tell \\1 from \1
|
|
end = line;
|
|
while (*end != command->hit) {
|
|
if (!*end) goto error;
|
|
if (*end++ == '\\') {
|
|
if (!*end || *end == '\n') {
|
|
end[-1] = '\n';
|
|
break;
|
|
}
|
|
end++;
|
|
}
|
|
}
|
|
|
|
reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
|
|
line = end;
|
|
// line continuation? (note: '\n' can't be a valid delim).
|
|
if (*line == command->hit) command->hit = 0;
|
|
else {
|
|
if (!*line) continue;
|
|
reg--;
|
|
line++;
|
|
goto resume_s;
|
|
}
|
|
|
|
// swap arg1/arg2 so they're back in order arguments occur.
|
|
i = command->arg1;
|
|
command->arg1 = command->arg2;
|
|
command->arg2 = i;
|
|
|
|
// get flags
|
|
for (line++; *line; line++) {
|
|
long l;
|
|
|
|
if (isspace(*line) && *line != '\n') continue;
|
|
|
|
if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
|
|
else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
|
|
command->sflags |= l << 3;
|
|
line--;
|
|
} else break;
|
|
}
|
|
|
|
// We deferred actually parsing the regex until we had the s///i flag
|
|
// allocating the space was done by extend_string() above
|
|
if (!*TT.remember) command->arg1 = 0;
|
|
else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
|
|
((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
|
|
free(TT.remember);
|
|
TT.remember = 0;
|
|
if (*line == 'w') {
|
|
line++;
|
|
goto writenow;
|
|
}
|
|
} else if (c == 'w') {
|
|
int fd, delim;
|
|
char *cc;
|
|
|
|
// Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
|
|
// eol status, and to retain the filename for error messages, we'd need
|
|
// to go up to arg5 just for this. Compromise: dynamically allocate the
|
|
// filehandle and eol status.
|
|
|
|
writenow:
|
|
while (isspace(*line)) line++;
|
|
if (!*line) goto error;
|
|
for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
|
|
delim = *cc;
|
|
*cc = 0;
|
|
fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
|
|
*cc = delim;
|
|
|
|
command->w = reg - (char *)command;
|
|
command = xrealloc(command, command->w+(cc-line)+6);
|
|
reg = command->w + (char *)command;
|
|
|
|
memcpy(reg, &fd, 4);
|
|
reg += 4;
|
|
*(reg++) = 0;
|
|
memcpy(reg, line, delim);
|
|
reg += delim;
|
|
*(reg++) = 0;
|
|
|
|
line = cc;
|
|
if (delim) line += 2;
|
|
} else if (c == 'y') {
|
|
char *s, delim = 0;
|
|
int len;
|
|
|
|
if (!(s = unescape_delimited_string(&line, &delim))) goto error;
|
|
command->arg1 = reg-(char *)command;
|
|
len = strlen(s);
|
|
reg = extend_string((void *)&command, s, reg-(char *)command, len);
|
|
free(s);
|
|
command->arg2 = reg-(char *)command;
|
|
if (!(s = unescape_delimited_string(&line, &delim))) goto error;
|
|
if (len != strlen(s)) goto error;
|
|
reg = extend_string((void *)&command, s, reg-(char*)command, len);
|
|
free(s);
|
|
} else if (strchr("abcirtTw:", c)) {
|
|
int end;
|
|
|
|
// trim leading spaces
|
|
while (isspace(*line) && *line != '\n') line++;
|
|
|
|
// Resume logic differs from 's' case because we don't add a newline
|
|
// unless it's after something, so we add it on return instead.
|
|
resume_a:
|
|
command->hit = 0;
|
|
|
|
// btT: end with space or semicolon, aicrw continue to newline.
|
|
if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
|
|
// Argument's optional for btT
|
|
if (strchr("btT", c)) continue;
|
|
else if (!command->arg1) break;
|
|
}
|
|
|
|
// Extend allocation to include new string. We use offsets instead of
|
|
// pointers so realloc() moving stuff doesn't break things. Ok to write
|
|
// \n over NUL terminator because call to extend_string() adds it back.
|
|
if (!command->arg1) command->arg1 = reg - (char*)command;
|
|
else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
|
|
else if (!pline) {
|
|
command->arg1 = 0;
|
|
continue;
|
|
}
|
|
reg = extend_string((void *)&command, line, reg - (char *)command, end);
|
|
|
|
// Recopy data to remove escape sequences and handle line continuation.
|
|
if (strchr("aci", c)) {
|
|
reg -= end+1;
|
|
for (i = end; i; i--) {
|
|
if ((*reg++ = *line++)=='\\') {
|
|
|
|
// escape at end of line: resume if -e escaped literal newline,
|
|
// else request callback and resume with next line
|
|
if (!--i) {
|
|
*--reg = 0;
|
|
if (*line) {
|
|
line++;
|
|
goto resume_a;
|
|
}
|
|
command->hit = 256;
|
|
break;
|
|
}
|
|
if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
|
|
line++;
|
|
}
|
|
}
|
|
*reg = 0;
|
|
} else line += end;
|
|
|
|
// Commands that take no arguments
|
|
} else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
|
|
}
|
|
|
|
error:
|
|
error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
|
|
}
|
|
|
|
void sed_main(void)
|
|
{
|
|
struct arg_list *al;
|
|
char **args = toys.optargs;
|
|
|
|
// Lie to autoconf when it asks stupid questions, so configure regexes
|
|
// that look for "GNU sed version %f" greater than some old buggy number
|
|
// don't fail us for not matching their narrow expectations.
|
|
if (toys.optflags & FLAG_version) {
|
|
xprintf("This is not GNU sed version 9.0\n");
|
|
return;
|
|
}
|
|
|
|
// Parse pattern into commands.
|
|
|
|
// If no -e or -f, first argument is the pattern.
|
|
if (!TT.e && !TT.f) {
|
|
if (!*toys.optargs) error_exit("no pattern");
|
|
(TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
|
|
}
|
|
|
|
// Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
|
|
// so handle all -e, then all -f. (At least the behavior's consistent.)
|
|
|
|
for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
|
|
for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
|
|
parse_pattern(0, 0);
|
|
dlist_terminate(TT.pattern);
|
|
if (TT.nextlen) error_exit("no }");
|
|
|
|
TT.fdout = 1;
|
|
TT.remember = xstrdup("");
|
|
|
|
// Inflict pattern upon input files. Long version because !O_CLOEXEC
|
|
loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed);
|
|
|
|
if (!(toys.optflags & FLAG_i)) process_line(0, 0);
|
|
|
|
// todo: need to close fd when done for TOYBOX_FREE?
|
|
}
|