diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-11-29 00:10:49 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-11-29 00:10:49 +0000 |
commit | e40c4a72a7db47c7614518690749cf4093b911a0 (patch) | |
tree | b4b77bcd6c82504db09e7e1dbf5db683b59e7c45 | |
parent | 13f5893f3addb30e157763ad236e7c2156e4b198 (diff) | |
parent | b7a213c11b2b60f1a1be18a29753132f0376d47f (diff) | |
download | one-true-awk-android14-qpr2-s1-release.tar.gz |
Snap for 11151698 from b7a213c11b2b60f1a1be18a29753132f0376d47f to 24Q1-release
android-14.0.0_r37 android-14.0.0_r36 android-14.0.0_r35 android-14.0.0_r34 android-14.0.0_r33 android-14.0.0_r32 android-14.0.0_r31 android-14.0.0_r30 android-14.0.0_r29 android14-qpr2-s5-release android14-qpr2-s4-release android14-qpr2-s3-release android14-qpr2-s2-release android14-qpr2-s1-release android14-qpr2-release
Change-Id: I6ce1405dccdee1d3b100db984c44280619d335ec
-rw-r--r-- | FIXES | 24 | ||||
-rw-r--r-- | METADATA | 4 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | awk.1 | 3 | ||||
-rw-r--r-- | awk.h | 11 | ||||
-rw-r--r-- | b.c | 279 | ||||
-rwxr-xr-x | bugs-fixed/REGRESS | 2 | ||||
-rw-r--r-- | lex.c | 14 | ||||
-rw-r--r-- | main.c | 2 | ||||
-rw-r--r-- | makefile | 8 | ||||
-rw-r--r-- | maketab.c | 4 | ||||
-rw-r--r-- | proto.h | 3 | ||||
-rw-r--r-- | run.c | 273 | ||||
-rwxr-xr-x | testdir/Compare.tt | 2 | ||||
-rwxr-xr-x | testdir/REGRESS | 2 | ||||
-rwxr-xr-x | testdir/T.csv | 1 | ||||
-rwxr-xr-x | testdir/T.flags | 5 | ||||
-rwxr-xr-x | testdir/T.misc | 14 |
18 files changed, 356 insertions, 299 deletions
@@ -25,6 +25,29 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Nov 24, 2023: + Fix issue #199: gototab improvements to dynamically resize the + table, qsort and bsearch to improve the lookup speed as the + table gets larger for multibyte input. thanks to Arnold Robbins. + +Nov 23, 2023: + Fix Issue #169, related to escape sequences in strings. + Thanks to Github user rajeevvp. + Fix Issue #147, reported by Github user drawkula, and fixed + by Miguel Pineiro Jr. + +Nov 20, 2023: + rewrite of fnematch to fix a number of issues, including + extraneous output, out-of-bounds access, number of bytes + to push back after a failed match etc. + thanks to Miguel Pineiro Jr. + +Nov 15, 2023: + Man page edit, regression test fixes. thanks to Arnold Robbins + consolidation of sub and gsub into dosub, removing duplicate + code. thanks to Miguel Pineiro Jr. + gcc replaced with cc everywhere. + Oct 30, 2023: multiple fixes and a minor code cleanup. disabled utf-8 for non-multibyte locales, such as C or POSIX. @@ -32,7 +55,6 @@ Oct 30, 2023: systems. also fixed an out-of-bounds read for empty CCL. fixed a buffer overflow in substr with utf-8 strings. many thanks to Todd C Miller. - Sep 24, 2023: fnematch and getrune have been overhauled to solve issues around @@ -9,11 +9,11 @@ third_party { type: GIT value: "https://github.com/onetrueawk/awk.git" } - version: "d801514094d1140dfc9f8571b9821082ddddf107" + version: "fbd1d5b712e27a9bb527e39ed6e9bf3b9afbb1df" license_type: NOTICE last_upgrade_date { year: 2023 month: 11 - day: 6 + day: 27 } } @@ -21,8 +21,6 @@ Aribtrary characters may be included with `\u` followed by 1 to 8 hexadecimal di ### Regular expressions ### Regular expressions may include UTF-8 code points, including `\u`. -Character classes are likely to be limited to about 256 characters -when expanded. ### CSV ### @@ -145,4 +143,4 @@ is not at the top of our priority list. 
#### Last Updated -Sun 15 Oct 2023 06:28:36 IDT +Mon 16 Oct 2023 11:23:08 IDT @@ -586,6 +586,9 @@ the syntax is worse. .PP Input is expected to be UTF-8 encoded. Other multibyte character sets are not handled. +However, in eight-bit locales, +.I awk +treats each input byte as a separate character. .SH UNUSUAL FLOATING-POINT VALUES .I Awk was designed before IEEE 754 arithmetic defined Not-A-Number (NaN) @@ -246,14 +246,19 @@ typedef struct rrow { int *lfollow; } rrow; -typedef struct gtt { /* gototab entry */ +typedef struct gtte { /* gototab entry */ unsigned int ch; unsigned int state; +} gtte; + +typedef struct gtt { /* gototab */ + size_t allocated; + size_t inuse; + gtte *entries; } gtt; typedef struct fa { - gtt **gototab; - int gototab_len; + gtt *gototab; uschar *out; uschar *restr; int **posns; @@ -96,9 +96,8 @@ extern int u8_nextlen(const char *s); mechanism of the goto table used 8-bit byte indices into the gototab entries to compute the next state. Unicode is a lot bigger, so the gototab entries are now structs with a character - and a next state, and there is a linear search of the characters - to find the state. (Yes, this is slower, by a significant - amount. Tough.) + and a next state. These are sorted by code point and binary + searched. Throughout the RE mechanism in b.c, utf-8 characters are converted to their utf-32 value. 
This mostly shows up in @@ -113,8 +112,10 @@ extern int u8_nextlen(const char *s); */ +static int entry_cmp(const void *l, const void *r); static int get_gototab(fa*, int, int); static int set_gototab(fa*, int, int, int); +static void clear_gototab(fa*, int); extern int u8_rune(int *, const uschar *); static int * @@ -142,7 +143,7 @@ resizesetvec(const char *f) static void resize_state(fa *f, int state) { - gtt **p; + gtt *p; uschar *p2; int **p3; int i, new_count; @@ -152,7 +153,7 @@ resize_state(fa *f, int state) new_count = state + 10; /* needs to be tuned */ - p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0])); + p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); if (p == NULL) goto out; f->gototab = p; @@ -168,13 +169,14 @@ resize_state(fa *f, int state) f->posns = p3; for (i = f->state_count; i < new_count; ++i) { - f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); - if (f->gototab[i] == NULL) + f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); + if (f->gototab[i].entries == NULL) goto out; - f->out[i] = 0; + f->gototab[i].allocated = NCHARS; + f->gototab[i].inuse = 0; + f->out[i] = 0; f->posns[i] = NULL; } - f->gototab_len = NCHARS; /* should be variable, growable */ f->state_count = new_count; return; out: @@ -268,8 +270,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - for (i = 0; i < NCHARS; i++) - set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ + clear_gototab(f, 2); f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ @@ -595,32 +596,104 @@ int member(int c, int *sarg) /* is c in s? 
*/ return(0); } +static void resize_gototab(fa *f, int state) +{ + size_t new_size = f->gototab[state].allocated * 2; + gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + + // need to initialized the new memory to zero + size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size + memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out + + f->gototab[state].allocated = new_size; // update gotottab info + f->gototab[state].entries = p; +} + static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { - int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0) - break; - if (f->gototab[state][i].ch == ch) - return f->gototab[state][i].state; - } - return 0; + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item == NULL) + return 0; + else + return item->state; +} + +static int entry_cmp(const void *l, const void *r) +{ + const gtte *left, *right; + + left = (const gtte *) l; + right = (const gtte *) r; + + return left->ch - right->ch; } static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ { - int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { - f->gototab[state][i].ch = ch; - f->gototab[state][i].state = val; - return val; + if (f->gototab[state].inuse == 0) { + f->gototab[state].entries[0].ch = ch; + f->gototab[state].entries[0].state = val; + f->gototab[state].inuse++; + return val; + } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { + // not seen yet, insert and return + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + + f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; + 
f->gototab[state].entries[f->gototab[state].inuse-1].state = val; + f->gototab[state].inuse++; + return val; + } else { + // maybe we have it, maybe we don't + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item != NULL) { + // we have it, update state and return + item->state = val; + return item->state; } + // otherwise, fall through to insert and reallocate. } - overflo(__func__); + + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + ++tab->inuse; + f->gototab[state].entries[tab->inuse].ch = ch; + f->gototab[state].entries[tab->inuse].state = val; + + qsort(f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), entry_cmp); + return val; /* not used anywhere at the moment */ } +static void clear_gototab(fa *f, int state) +{ + memset(f->gototab[state].entries, 0, + f->gototab[state].allocated * sizeof(gtte)); + f->gototab[state].inuse = 0; +} + int match(fa *f, const char *p0) /* shortest match ? */ { int s, ns; @@ -759,59 +832,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */ #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long -// Read one rune at a time from the given FILE*. Return both -// the bytes and the actual rune. 
- -struct runedata { - int rune; - size_t len; - char bytes[6]; -}; - -struct runedata getrune(FILE *fp) -{ - struct runedata result; - int c, next; - - memset(&result, 0, sizeof(result)); - - c = getc(fp); - if (c == EOF) - return result; // result.rune == 0 --> EOF - else if (c < 128 || awk_mb_cur_max == 1) { - result.bytes[0] = c; - result.len = 1; - result.rune = c; - - return result; - } - - // need to get bytes and fill things in - result.bytes[0] = c; - result.len = 1; - - next = 1; - for (int i = 1; i < MAX_UTF_BYTES; i++) { - c = getc(fp); - if (c == EOF) - break; - result.bytes[next++] = c; - result.len++; - } - - // put back any extra input bytes - int actual_len = u8_nextlen(result.bytes); - while (result.len > actual_len) { - ungetc(result.bytes[--result.len], fp); - } - - result.bytes[result.len] = '\0'; - (void) u8_rune(& result.rune, (uschar *) result.bytes); - - return result; -} - - /* * NAME * fnematch @@ -829,58 +849,76 @@ struct runedata getrune(FILE *fp) bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) { - char *buf = *pbuf; + char *i, *j, *k, *buf = *pbuf; int bufsize = *pbufsize; - int i, j, k, ns, s; - struct runedata r; + int c, n, ns, s; s = pfa->initstat; patlen = 0; /* - * All indices relative to buf. - * i <= j <= k <= bufsize + * buf <= i <= j <= k <= buf+bufsize * - * i: origin of active substring (first byte of first character) - * j: current character (last byte of current character) - * k: destination of next getc() + * i: origin of active substring + * j: current character + * k: destination of the next getc */ - i = -1, k = 0; - do { - j = i++; - do { - r = getrune(f); - if ((++j + r.len) >= k) { - if (k >= bufsize) - if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) - FATAL("stream '%.30s...' 
too long", buf); - } - memcpy(buf + k, r.bytes, r.len); - j += r.len - 1; // incremented next time around the loop - k += r.len; - if ((ns = get_gototab(pfa, s, r.rune)) != 0) - s = ns; - else - s = cgoto(pfa, s, r.rune); + i = j = k = buf; - if (pfa->out[s]) { /* final state */ - patlen = j - i + 1; - if (r.rune == 0) /* don't count $ */ - patlen--; + do { + /* + * Call u8_rune with at least MAX_UTF_BYTES ahead in + * the buffer until EOF interferes. + */ + if (k - j < MAX_UTF_BYTES) { + if (k + MAX_UTF_BYTES > buf + bufsize) { + adjbuf((char **) &buf, &bufsize, + bufsize + MAX_UTF_BYTES, + quantum, 0, "fnematch"); } - } while (buf[j] && s != 1); + for (n = MAX_UTF_BYTES ; n > 0; n--) { + *k++ = (c = getc(f)) != EOF ? c : 0; + if (c == EOF) { + if (ferror(f)) + FATAL("fnematch: getc error"); + break; + } + } + } + + j += u8_rune(&c, (uschar *)j); + + if ((ns = get_gototab(pfa, s, c)) != 0) + s = ns; + else + s = cgoto(pfa, s, c); + + if (pfa->out[s]) { /* final state */ + patbeg = i; + patlen = j - i; + if (c == 0) /* don't count $ */ + patlen--; + } + + if (c && s != 1) + continue; /* origin i still viable, next j */ + if (patlen) + break; /* best match found */ + + /* no match at origin i, next i and start over */ + i += u8_rune(&c, (uschar *)i); + if (c == 0) + break; /* no match */ + j = i; s = 2; - if (r.len > 1) - i += r.len - 1; // i incremented around the loop - } while (buf[i] && !patlen); + } while (1); /* adjbuf() may have relocated a resized buffer. Inform the world. */ *pbuf = buf; *pbufsize = bufsize; if (patlen) { - patbeg = (char *) buf + i; /* * Under no circumstances is the last character fed to * the automaton part of the match. It is EOF's nullbyte, @@ -893,11 +931,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) * terminate the buffer. 
*/ do - for (int ii = r.len; ii > 0; ii--) - if (buf[--k] && ungetc(buf[k], f) == EOF) - FATAL("unable to ungetc '%c'", buf[k]); - while (k > i + patlen); - buf[k] = '\0'; + if (*--k && ungetc(*k, f) == EOF) + FATAL("unable to ungetc '%c'", *k); + while (k > patbeg + patlen); + *k = '\0'; return true; } else @@ -1486,8 +1523,7 @@ int cgoto(fa *f, int s, int c) /* add tmpset to current set of states */ ++(f->curstat); resize_state(f, f->curstat); - for (i = 0; i < NCHARS; i++) - set_gototab(f, f->curstat, 0, 0); + clear_gototab(f, f->curstat); xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); @@ -1511,7 +1547,8 @@ void freefa(fa *f) /* free a finite automaton */ if (f == NULL) return; for (i = 0; i < f->state_count; i++) - xfree(f->gototab[i]) + xfree(f->gototab[i].entries); + xfree(f->gototab); for (i = 0; i <= f->curstat; i++) xfree(f->posns[i]); for (i = 0; i <= f->accept; i++) { diff --git a/bugs-fixed/REGRESS b/bugs-fixed/REGRESS index 0716003..98d578a 100755 --- a/bugs-fixed/REGRESS +++ b/bugs-fixed/REGRESS @@ -1,4 +1,4 @@ -#! /bin/bash +#! /bin/sh if [ ! -f ../a.out ] then @@ -421,8 +421,12 @@ int string(void) { int i; + if (!isxdigit(peek())) { + unput(c); + break; + } n = 0; - for (i = 1; i <= 2; i++) { + for (i = 0; i < 2; i++) { c = input(); if (c == 0) break; @@ -433,13 +437,13 @@ int string(void) n += (c - '0'); else n += 10 + (c - 'a'); - } else + } else { + unput(c); break; + } } - if (n) + if (i) *bp++ = n; - else - unput(c); break; } @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
****************************************************************/ -const char *version = "version 20231030"; +const char *version = "version 20231124"; #define DEBUG #include <stdio.h> @@ -28,10 +28,10 @@ CFLAGS = CFLAGS = -O2 # compiler options -#CC = gcc -Wall -g -Wwrite-strings -#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing -#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov -HOSTCC = gcc -g -Wall -pedantic -Wcast-qual +#CC = cc -Wall -g -Wwrite-strings +#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing +#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov +HOSTCC = cc -g -Wall -pedantic -Wcast-qual CC = $(HOSTCC) # change this is cross-compiling. # By fiat, to make our lives easier, yacc is now defined to be bison. @@ -52,8 +52,8 @@ struct xx { ARRAY, "array", NULL }, { INDIRECT, "indirect", "$(" }, { SUBSTR, "substr", "substr" }, - { SUB, "sub", "sub" }, - { GSUB, "gsub", "gsub" }, + { SUB, "dosub", "sub" }, + { GSUB, "dosub", "gsub" }, { INDEX, "sindex", "sindex" }, { SPRINTF, "awksprintf", "sprintf " }, { ADD, "arith", " + " }, @@ -196,8 +196,7 @@ extern FILE *openfile(int, const char *, bool *); extern const char *filename(FILE *); extern Cell *closefile(Node **, int); extern void closeall(void); -extern Cell *sub(Node **, int); -extern Cell *gsub(Node **, int); +extern Cell *dosub(Node **, int); extern FILE *popen(const char *, const char *); extern int pclose(FILE *); @@ -1540,8 +1540,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. 
*/ if (x == y && !(x->tval & (FLD|REC)) && x != nfloc) ; /* self-assignment: leave alone unless it's a field or NF */ else if ((y->tval & (STR|NUM)) == (STR|NUM)) { + yf = getfval(y); setsval(x, getsval(y)); - x->fval = getfval(y); + x->fval = yf; x->tval |= NUM; } else if (isstr(y)) @@ -2397,169 +2398,143 @@ static void flush_all(void) void backsub(char **pb_ptr, const char **sptr_ptr); -Cell *sub(Node **a, int nnn) /* substitute command */ +Cell *dosub(Node **a, int subop) /* sub and gsub */ { - const char *sptr, *q; - Cell *x, *y, *result; - char *t, *buf, *pb; fa *pfa; + int tempstat; + char *repl; + Cell *x; + + char *buf = NULL; + char *pb = NULL; int bufsz = recsize; - if ((buf = (char *) malloc(bufsz)) == NULL) - FATAL("out of memory in sub"); - x = execute(a[3]); /* target string */ - t = getsval(x); - if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */ - pfa = (fa *) a[1]; /* regular expression */ - else { - y = execute(a[1]); - pfa = makedfa(getsval(y), 1); - tempfree(y); + const char *r, *s; + const char *start; + const char *noempty = NULL; /* empty match disallowed here */ + size_t m = 0; /* match count */ + size_t whichm; /* which match to select, 0 = global */ + int mtype; /* match type */ + + if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */ + pfa = (fa *) a[1]; + } else { + x = execute(a[1]); + pfa = makedfa(getsval(x), 1); + tempfree(x); } - y = execute(a[2]); /* replacement string */ - result = False; - if (pmatch(pfa, t)) { - sptr = t; - adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub"); - pb = buf; - while (sptr < patbeg) - *pb++ = *sptr++; - sptr = getsval(y); - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; + + x = execute(a[2]); /* replacement string */ 
+ repl = tostring(getsval(x)); + tempfree(x); + + switch (subop) { + case SUB: + whichm = 1; + x = execute(a[3]); /* source string */ + break; + case GSUB: + whichm = 0; + x = execute(a[3]); /* source string */ + break; + default: + FATAL("dosub: unrecognized subop: %d", subop); + } + + start = getsval(x); + while (pmatch(pfa, start)) { + if (buf == NULL) { + if ((pb = buf = malloc(bufsz)) == NULL) + FATAL("out of memory in dosub"); + tempstat = pfa->initstat; + pfa->initstat = 2; } - *pb = '\0'; - if (pb > buf + bufsz) - FATAL("sub result1 %.30s too big; can't happen", buf); - sptr = patbeg + patlen; - if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) { - adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub"); - while ((*pb++ = *sptr++) != '\0') - continue; + + /* match types */ + #define MT_IGNORE 0 /* unselected or invalid */ + #define MT_INSERT 1 /* selected, empty */ + #define MT_REPLACE 2 /* selected, not empty */ + + /* an empty match just after replacement is invalid */ + + if (patbeg == noempty && patlen == 0) { + mtype = MT_IGNORE; /* invalid, not counted */ + } else if (whichm == ++m || whichm == 0) { + mtype = patlen ? 
MT_REPLACE : MT_INSERT; + } else { + mtype = MT_IGNORE; /* unselected, but counted */ } - if (pb > buf + bufsz) - FATAL("sub result2 %.30s too big; can't happen", buf); - setsval(x, buf); /* BUG: should be able to avoid copy */ - result = True; - } - tempfree(x); - tempfree(y); - free(buf); - return result; -} -Cell *gsub(Node **a, int nnn) /* global substitute */ -{ - Cell *x, *y; - char *rptr, *pb; - const char *q, *t, *sptr; - char *buf; - fa *pfa; - int mflag, tempstat, num; - int bufsz = recsize; - int charlen = 0; + /* leading text: */ + if (patbeg > start) { + adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start), + recsize, &pb, "dosub"); + s = start; + while (s < patbeg) + *pb++ = *s++; + } - if ((buf = (char *) malloc(bufsz)) == NULL) - FATAL("out of memory in gsub"); - mflag = 0; /* if mflag == 0, can replace empty string */ - num = 0; - x = execute(a[3]); /* target string */ - t = getsval(x); - if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */ - pfa = (fa *) a[1]; /* regular expression */ - else { - y = execute(a[1]); - pfa = makedfa(getsval(y), 1); - tempfree(y); - } - y = execute(a[2]); /* replacement string */ - if (pmatch(pfa, t)) { - tempstat = pfa->initstat; - pfa->initstat = 2; - pb = buf; - rptr = getsval(y); - do { - if (patlen == 0 && *patbeg != '\0') { /* matched empty string */ - if (mflag == 0) { /* can replace empty */ - num++; - sptr = rptr; - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; - } - } - if (*t == '\0') /* at end */ - goto done; - adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub"); - charlen = u8_nextlen(t); - while (charlen-- > 0) - *pb++ = *t++; - if (pb > buf + bufsz) /* BUG: not sure of this test */ - FATAL("gsub result0 %.30s too big; can't happen", 
buf); - mflag = 0; + if (mtype == MT_IGNORE) + goto matching_text; /* skip replacement text */ + + r = repl; + while (*r != 0) { + adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub"); + if (*r == '\\') { + backsub(&pb, &r); + } else if (*r == '&') { + r++; + adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, + &pb, "dosub"); + for (s = patbeg; s < patbeg+patlen; ) + *pb++ = *s++; + } else { + *pb++ = *r++; } - else { /* matched nonempty string */ - num++; - sptr = t; - adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub"); - while (sptr < patbeg) - *pb++ = *sptr++; - sptr = rptr; - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; - } - t = patbeg + patlen; - if (patlen == 0 || *t == '\0' || *(t-1) == '\0') - goto done; - if (pb > buf + bufsz) - FATAL("gsub result1 %.30s too big; can't happen", buf); - mflag = 1; - } - } while (pmatch(pfa,t)); - sptr = t; - adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub"); - while ((*pb++ = *sptr++) != '\0') - continue; - done: if (pb < buf + bufsz) - *pb = '\0'; - else if (*(pb-1) != '\0') - FATAL("gsub result2 %.30s truncated; can't happen", buf); - setsval(x, buf); /* BUG: should be able to avoid copy + free */ + } + +matching_text: + if (mtype == MT_REPLACE || *patbeg == '\0') + goto next_search; /* skip matching text */ + + if (patlen == 0) + patlen = u8_nextlen(patbeg); + adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub"); + s = patbeg; + while (s < patbeg + patlen) + *pb++ = *s++; + +next_search: + start = patbeg + patlen; + if (m == whichm || *patbeg == '\0') + break; + if (mtype == MT_REPLACE) + noempty = start; + + #undef MT_IGNORE + #undef MT_INSERT + #undef MT_REPLACE + } + + xfree(repl); + + if (buf != NULL) { pfa->initstat = 
tempstat; + + /* trailing text */ + adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub"); + while ((*pb++ = *start++) != '\0') + ; + + setsval(x, buf); + free(buf); } + tempfree(x); - tempfree(y); x = gettemp(); x->tval = NUM; - x->fval = num; - free(buf); - return(x); + x->fval = m; + return x; } void backsub(char **pb_ptr, const char **sptr_ptr) /* handle \\& variations */ diff --git a/testdir/Compare.tt b/testdir/Compare.tt index ca828d2..4b297d7 100755 --- a/testdir/Compare.tt +++ b/testdir/Compare.tt @@ -4,7 +4,7 @@ oldawk=${oldawk-awk} awk=${awk-../a.out} echo compiling time.c -gcc time.c -o time +cc time.c -o time time=./time echo time command = $time diff --git a/testdir/REGRESS b/testdir/REGRESS index 5c3667f..b54ce3f 100755 --- a/testdir/REGRESS +++ b/testdir/REGRESS @@ -1,7 +1,7 @@ #!/bin/sh uname -a -gcc echo.c -o echo && echo echo compiled +cc echo.c -o echo && echo echo compiled oldawk=${oldawk-awk} awk=${awk-../a.out} diff --git a/testdir/T.csv b/testdir/T.csv index 10da1ea..79c1510 100755 --- a/testdir/T.csv +++ b/testdir/T.csv @@ -77,5 +77,4 @@ a''b [a''b] a, [a][] "", [][] , [][] -a"b [a"b] !!!! 
diff --git a/testdir/T.flags b/testdir/T.flags index 33d7c8d..17ce561 100755 --- a/testdir/T.flags +++ b/testdir/T.flags @@ -20,5 +20,6 @@ grep 'unknown option' foo >/dev/null || echo 'T.flags: bad unknown option' $awk -F >foo 2>&1 grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator' -$awk -F '' >foo 2>&1 -grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator' +### Awk is now like gawk and splits into separate characters if FS = "" +# $awk -F '' >foo 2>&1 +# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator' diff --git a/testdir/T.misc b/testdir/T.misc index 1e5c3c5..b8ed3c1 100755 --- a/testdir/T.misc +++ b/testdir/T.misc @@ -510,3 +510,17 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error' echo 1b >foo1 echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2 cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered' + +# Check handling of octal \OOO and hex \xHH esc. seqs. in strings. +echo 'hello888 +hello +hello +helloxGOO +hello +0A' > foo1 +$awk 'BEGIN { print "hello\888" }' > foo2 +$awk 'BEGIN { print "hello\x000A" }' >> foo2 +$awk 'BEGIN { printf "hello\x0A" }' >> foo2 +$awk 'BEGIN { print "hello\xGOO" }' >> foo2 +$awk 'BEGIN { print "hello\x0A0A" }' >> foo2 +cmp -s foo1 foo2 || echo '�BAD: T.misc escape sequences in strings mishandled' |