From 328f0e4eddc575d7072c17e6082d91bb9851c878 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Tue, 10 Oct 2023 20:47:17 +0300 Subject: First round, dynamically grow entries in a gototab. --- awk.h | 10 +++++++--- b.c | 42 +++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/awk.h b/awk.h index 49b5dfc..5901810 100644 --- a/awk.h +++ b/awk.h @@ -244,14 +244,18 @@ typedef struct rrow { int *lfollow; } rrow; -typedef struct gtt { /* gototab entry */ +typedef struct gtte { /* gototab entry */ unsigned int ch; unsigned int state; +} gtte; + +typedef struct gtt { /* gototab */ + size_t allocated; + gtte *entries; } gtt; typedef struct fa { - gtt **gototab; - int gototab_len; + gtt *gototab; uschar *out; uschar *restr; int **posns; diff --git a/b.c b/b.c index 55b320e..7a97586 100644 --- a/b.c +++ b/b.c @@ -142,7 +142,7 @@ resizesetvec(const char *f) static void resize_state(fa *f, int state) { - gtt **p; + gtt *p; uschar *p2; int **p3; int i, new_count; @@ -152,7 +152,7 @@ resize_state(fa *f, int state) new_count = state + 10; /* needs to be tuned */ - p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0])); + p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); if (p == NULL) goto out; f->gototab = p; @@ -168,13 +168,13 @@ resize_state(fa *f, int state) f->posns = p3; for (i = f->state_count; i < new_count; ++i) { - f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); - if (f->gototab[i] == NULL) + f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); + if (f->gototab[i].entries == NULL) goto out; - f->out[i] = 0; + f->gototab[i].allocated = NCHARS; + f->out[i] = 0; f->posns[i] = NULL; } - f->gototab_len = NCHARS; /* should be variable, growable */ f->state_count = new_count; return; out: @@ -268,7 +268,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - for (i = 0; i < NCHARS; i++) + for (i = 0; i < f->gototab[2].allocated; i++) 
set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ f->curstat = cgoto(f, 2, HAT); if (anchor) { @@ -598,11 +598,11 @@ int member(int c, int *sarg) /* is c in s? */ static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0) + for (i = 0; i < f->gototab[state].allocated; i++) { + if (f->gototab[state].entries[i].ch == 0) break; - if (f->gototab[state][i].ch == ch) - return f->gototab[state][i].state; + if (f->gototab[state].entries[i].ch == ch) + return f->gototab[state].entries[i].state; } return 0; } @@ -610,14 +610,22 @@ static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ { int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { - f->gototab[state][i].ch = ch; - f->gototab[state][i].state = val; + gtte *p; + for (i = 0; i < f->gototab[state].allocated; i++) { + if (f->gototab[state].entries[i].ch == 0 || f->gototab[state].entries[i].ch == ch) { + f->gototab[state].entries[i].ch = ch; + f->gototab[state].entries[i].state = val; return val; } } - overflo(__func__); + p = realloc(f->gototab[state].entries, ++(f->gototab[state].allocated) * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + + f->gototab[state].entries = p; + f->gototab[state].entries[i].ch = ch; + f->gototab[state].entries[i].state = val; + return val; /* not used anywhere at the moment */ } @@ -1511,7 +1519,7 @@ void freefa(fa *f) /* free a finite automaton */ if (f == NULL) return; for (i = 0; i < f->state_count; i++) - xfree(f->gototab[i]) + xfree(f->gototab[i].entries) for (i = 0; i <= f->curstat; i++) xfree(f->posns[i]); for (i = 0; i <= f->accept; i++) { -- cgit v1.2.3 From e7ad51dc32aa7b61dfce903c82c82f75b93bfae7 Mon Sep 17 00:00:00 2001 From: "Arnold D. 
Robbins" Date: Tue, 10 Oct 2023 21:05:18 +0300 Subject: Sort the gototab entries and binary search them. --- b.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/b.c b/b.c index 7a97586..73ac84c 100644 --- a/b.c +++ b/b.c @@ -96,9 +96,8 @@ extern int u8_nextlen(const char *s); mechanism of the goto table used 8-bit byte indices into the gototab entries to compute the next state. Unicode is a lot bigger, so the gototab entries are now structs with a character - and a next state, and there is a linear search of the characters - to find the state. (Yes, this is slower, by a significant - amount. Tough.) + and a next state. These are sorted by code point and binary + searched. Throughout the RE mechanism in b.c, utf-8 characters are converted to their utf-32 value. This mostly shows up in @@ -113,6 +112,7 @@ extern int u8_nextlen(const char *s); */ +static int entry_cmp(const void *l, const void *r); static int get_gototab(fa*, int, int); static int set_gototab(fa*, int, int, int); extern int u8_rune(int *, const uschar *); @@ -597,6 +597,7 @@ int member(int c, int *sarg) /* is c in s? 
*/ static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { +#if 0 int i; for (i = 0; i < f->gototab[state].allocated; i++) { if (f->gototab[state].entries[i].ch == 0) @@ -605,6 +606,32 @@ static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation return f->gototab[state].entries[i].state; } return 0; +#else + + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].allocated, sizeof(gtte), + entry_cmp); + + if (item == NULL) + return 0; + else + return item->state; +#endif +} + +static int entry_cmp(const void *l, const void *r) +{ + const gtte *left, *right; + + left = (const gtte *) l; + right = (const gtte *) r; + + return left->ch - right->ch; } static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ @@ -626,6 +653,8 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem f->gototab[state].entries[i].ch = ch; f->gototab[state].entries[i].state = val; + qsort(p, f->gototab[state].allocated, sizeof(gtte), entry_cmp); + return val; /* not used anywhere at the moment */ } -- cgit v1.2.3 From fa355f3eb97cdd89a525d61e4b4118a27fff26f7 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Wed, 11 Oct 2023 09:01:04 +0300 Subject: Track inuse vs. allocated, use bsearch in set_gototab. 
--- awk.h | 1 + b.c | 54 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/awk.h b/awk.h index 5901810..6a9c901 100644 --- a/awk.h +++ b/awk.h @@ -251,6 +251,7 @@ typedef struct gtte { /* gototab entry */ typedef struct gtt { /* gototab */ size_t allocated; + size_t inuse; gtte *entries; } gtt; diff --git a/b.c b/b.c index 73ac84c..ad1fd49 100644 --- a/b.c +++ b/b.c @@ -172,6 +172,7 @@ resize_state(fa *f, int state) if (f->gototab[i].entries == NULL) goto out; f->gototab[i].allocated = NCHARS; + f->gototab[i].inuse = 0; f->out[i] = 0; f->posns[i] = NULL; } @@ -268,7 +269,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - for (i = 0; i < f->gototab[2].allocated; i++) + for (i = 0; i < f->gototab[2].inuse; i++) set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ f->curstat = cgoto(f, 2, HAT); if (anchor) { @@ -597,31 +598,19 @@ int member(int c, int *sarg) /* is c in s? */ static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { -#if 0 - int i; - for (i = 0; i < f->gototab[state].allocated; i++) { - if (f->gototab[state].entries[i].ch == 0) - break; - if (f->gototab[state].entries[i].ch == ch) - return f->gototab[state].entries[i].state; - } - return 0; -#else - gtte key; gtte *item; key.ch = ch; key.state = 0; /* irrelevant */ item = bsearch(& key, f->gototab[state].entries, - f->gototab[state].allocated, sizeof(gtte), + f->gototab[state].inuse, sizeof(gtte), entry_cmp); if (item == NULL) return 0; else return item->state; -#endif } static int entry_cmp(const void *l, const void *r) @@ -636,8 +625,8 @@ static int entry_cmp(const void *l, const void *r) static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ { +#if 0 int i; - gtte *p; for (i = 0; i < f->gototab[state].allocated; i++) { if (f->gototab[state].entries[i].ch == 0 || f->gototab[state].entries[i].ch == ch) { 
f->gototab[state].entries[i].ch = ch; @@ -645,15 +634,36 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem return val; } } - p = realloc(f->gototab[state].entries, ++(f->gototab[state].allocated) * sizeof(gtte)); - if (p == NULL) - overflo(__func__); +#else + gtte key; + gtte *item, *p; - f->gototab[state].entries = p; - f->gototab[state].entries[i].ch = ch; - f->gototab[state].entries[i].state = val; + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item != NULL) { + item->state = state; + return item->state; + } +#endif + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) { + size_t new_size = tab->allocated * 2; + p = realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + f->gototab[state].allocated = new_size; + f->gototab[state].entries = p; + } + ++tab->inuse; + f->gototab[state].entries[tab->inuse].ch = ch; + f->gototab[state].entries[tab->inuse].state = val; - qsort(p, f->gototab[state].allocated, sizeof(gtte), entry_cmp); + qsort(f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), entry_cmp); return val; /* not used anywhere at the moment */ } -- cgit v1.2.3 From f4f0b0dd085ee82bfca2d761caeb07a9a5ba8d2c Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Fri, 13 Oct 2023 08:33:34 +0300 Subject: Two small fixes. 
--- b.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b.c b/b.c index ad1fd49..f6108a6 100644 --- a/b.c +++ b/b.c @@ -269,7 +269,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - for (i = 0; i < f->gototab[2].inuse; i++) + for (i = 0; i < f->gototab[2].allocated; i++) set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ f->curstat = cgoto(f, 2, HAT); if (anchor) { @@ -645,7 +645,7 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem entry_cmp); if (item != NULL) { - item->state = state; + item->state = val; return item->state; } #endif -- cgit v1.2.3 From 220fd4eb0abc25eb923a4f98d34dae3fd63c1b37 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Fri, 13 Oct 2023 10:44:08 +0300 Subject: Be smarter about clearing a goto table. --- b.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/b.c b/b.c index f6108a6..df1ae8b 100644 --- a/b.c +++ b/b.c @@ -115,6 +115,7 @@ extern int u8_nextlen(const char *s); static int entry_cmp(const void *l, const void *r); static int get_gototab(fa*, int, int); static int set_gototab(fa*, int, int, int); +static void clear_gototab(fa*, int); extern int u8_rune(int *, const uschar *); static int * @@ -269,8 +270,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - for (i = 0; i < f->gototab[2].allocated; i++) - set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ + clear_gototab(f, 2); f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ @@ -668,6 +668,13 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem return val; /* not used anywhere at the moment */ } +static void clear_gototab(fa *f, int state) +{ + memset(f->gototab[state].entries, 0, + f->gototab[state].allocated * sizeof(gtte)); + f->gototab[state].inuse = 0; +} + int match(fa *f, const char *p0) /* shortest match ? 
*/ { int s, ns; @@ -1533,8 +1540,7 @@ int cgoto(fa *f, int s, int c) /* add tmpset to current set of states */ ++(f->curstat); resize_state(f, f->curstat); - for (i = 0; i < NCHARS; i++) - set_gototab(f, f->curstat, 0, 0); + clear_gototab(f, f->curstat); xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); -- cgit v1.2.3 From bae27a93c1298f3c7fcb1cd10e690c59a4fcc3f1 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Sat, 14 Oct 2023 20:30:00 +0300 Subject: Speed it back up. --- b.c | 60 ++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/b.c b/b.c index df1ae8b..b8782b1 100644 --- a/b.c +++ b/b.c @@ -625,34 +625,50 @@ static int entry_cmp(const void *l, const void *r) static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ { -#if 0 - int i; - for (i = 0; i < f->gototab[state].allocated; i++) { - if (f->gototab[state].entries[i].ch == 0 || f->gototab[state].entries[i].ch == ch) { - f->gototab[state].entries[i].ch = ch; - f->gototab[state].entries[i].state = val; - return val; + if (f->gototab[state].inuse == 0) { + f->gototab[state].entries[0].ch = ch; + f->gototab[state].entries[0].state = val; + f->gototab[state].inuse++; + return val; + } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { + // not seen yet, insert and return + // FIXME: (Oz, hint, hint): Resizing should be pulled out into a function... 
+ gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) { + size_t new_size = tab->allocated * 2; + gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + f->gototab[state].allocated = new_size; + f->gototab[state].entries = p; + } + f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; + f->gototab[state].entries[f->gototab[state].inuse-1].state = val; + f->gototab[state].inuse++; + return val; + } else { + // maybe we have it, maybe we don't + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item != NULL) { + // we have it, update state and return + item->state = val; + return item->state; } + // otherwise, fall through to insert and reallocate. } -#else - gtte key; - gtte *item, *p; - key.ch = ch; - key.state = 0; /* irrelevant */ - item = bsearch(& key, f->gototab[state].entries, - f->gototab[state].inuse, sizeof(gtte), - entry_cmp); - - if (item != NULL) { - item->state = val; - return item->state; - } -#endif gtt *tab = & f->gototab[state]; if (tab->inuse + 1 >= tab->allocated) { size_t new_size = tab->allocated * 2; - p = realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); if (p == NULL) overflo(__func__); f->gototab[state].allocated = new_size; -- cgit v1.2.3 From 25baaf87ddcd93aa8c5f9f0afa8abdd318d6273b Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Sun, 15 Oct 2023 11:27:40 -0400 Subject: gototab reallocation pulled into resize_gototab --- b.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/b.c b/b.c index b8782b1..d665ff9 100644 --- a/b.c +++ b/b.c @@ -596,6 +596,16 @@ int member(int c, int *sarg) /* is c in s? 
*/ return(0); } +static void resize_gototab(fa *f, int state) +{ + size_t new_size = f->gototab[state].allocated * 2; + gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + f->gototab[state].allocated = new_size; + f->gototab[state].entries = p; +} + static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { gtte key; @@ -632,16 +642,10 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem return val; } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { // not seen yet, insert and return - // FIXME: (Oz, hint, hint): Resizing should be pulled out into a function... gtt *tab = & f->gototab[state]; - if (tab->inuse + 1 >= tab->allocated) { - size_t new_size = tab->allocated * 2; - gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); - if (p == NULL) - overflo(__func__); - f->gototab[state].allocated = new_size; - f->gototab[state].entries = p; - } + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; f->gototab[state].entries[f->gototab[state].inuse-1].state = val; f->gototab[state].inuse++; @@ -666,14 +670,8 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem } gtt *tab = & f->gototab[state]; - if (tab->inuse + 1 >= tab->allocated) { - size_t new_size = tab->allocated * 2; - gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); - if (p == NULL) - overflo(__func__); - f->gototab[state].allocated = new_size; - f->gototab[state].entries = p; - } + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); ++tab->inuse; f->gototab[state].entries[tab->inuse].ch = ch; f->gototab[state].entries[tab->inuse].state = val; -- cgit v1.2.3 From 58dba2799941de8bdd486fcc3dde1b0ad8812c50 Mon Sep 17 00:00:00 2001 From: "Arnold D. 
Robbins" Date: Mon, 16 Oct 2023 11:23:22 +0300 Subject: Remove limit on character classes from README.md, now that code is fixed. --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index daace23..84fb06e 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,6 @@ Aribtrary characters may be included with `\u` followed by 1 to 8 hexadecimal di ### Regular expressions ### Regular expressions may include UTF-8 code points, including `\u`. -Character classes are likely to be limited to about 256 characters -when expanded. ### CSV ### @@ -145,4 +143,4 @@ is not at the top of our priority list. #### Last Updated -Sun 15 Oct 2023 06:28:36 IDT +Mon 16 Oct 2023 11:23:08 IDT -- cgit v1.2.3 From 11b2b7b6d5c42ea63f79ce1c1d88264f83cc2155 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Wed, 25 Oct 2023 15:26:30 +0300 Subject: Add a missing free of f->gototab. --- b.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b.c b/b.c index d665ff9..7ca1e4f 100644 --- a/b.c +++ b/b.c @@ -1578,7 +1578,8 @@ void freefa(fa *f) /* free a finite automaton */ if (f == NULL) return; for (i = 0; i < f->state_count; i++) - xfree(f->gototab[i].entries) + xfree(f->gototab[i].entries); + xfree(f->gototab); for (i = 0; i <= f->curstat; i++) xfree(f->posns[i]); for (i = 0; i <= f->accept; i++) { -- cgit v1.2.3 From 144b14f99420a9ac9a63d177c11e99bec8b45b64 Mon Sep 17 00:00:00 2001 From: Miguel Pineiro Jr Date: Mon, 6 Nov 2023 22:12:36 -0500 Subject: Consolidate sub and gsub in dosub In addition to the deduplicative benefits, several uses-after-free are fixed: function d() { delete a } BEGIN { sub(/./, d(), a[0]) } function d() { delete a } BEGIN { sub(d(), "repl", a[0]) } function d() { delete a } BEGIN { gsub(/./, d(), a[0]) } function d() { delete a } BEGIN { gsub(d(), "repl", a[0]) } When not matching globally, dosub breaks the loop immediately after the lone replacement (a significant speedup in some cases). 
--- maketab.c | 4 +- proto.h | 3 +- run.c | 270 ++++++++++++++++++++++++++++---------------------------------- 3 files changed, 125 insertions(+), 152 deletions(-) diff --git a/maketab.c b/maketab.c index 433541e..3747efa 100644 --- a/maketab.c +++ b/maketab.c @@ -52,8 +52,8 @@ struct xx { ARRAY, "array", NULL }, { INDIRECT, "indirect", "$(" }, { SUBSTR, "substr", "substr" }, - { SUB, "sub", "sub" }, - { GSUB, "gsub", "gsub" }, + { SUB, "dosub", "sub" }, + { GSUB, "dosub", "gsub" }, { INDEX, "sindex", "sindex" }, { SPRINTF, "awksprintf", "sprintf " }, { ADD, "arith", " + " }, diff --git a/proto.h b/proto.h index cb4988e..ed63e78 100644 --- a/proto.h +++ b/proto.h @@ -196,8 +196,7 @@ extern FILE *openfile(int, const char *, bool *); extern const char *filename(FILE *); extern Cell *closefile(Node **, int); extern void closeall(void); -extern Cell *sub(Node **, int); -extern Cell *gsub(Node **, int); +extern Cell *dosub(Node **, int); extern FILE *popen(const char *, const char *); extern int pclose(FILE *); diff --git a/run.c b/run.c index a9ef242..e78cae9 100644 --- a/run.c +++ b/run.c @@ -2397,169 +2397,143 @@ static void flush_all(void) void backsub(char **pb_ptr, const char **sptr_ptr); -Cell *sub(Node **a, int nnn) /* substitute command */ +Cell *dosub(Node **a, int subop) /* sub and gsub */ { - const char *sptr, *q; - Cell *x, *y, *result; - char *t, *buf, *pb; fa *pfa; + int tempstat; + char *repl; + Cell *x; + + char *buf = NULL; + char *pb = NULL; int bufsz = recsize; - if ((buf = (char *) malloc(bufsz)) == NULL) - FATAL("out of memory in sub"); - x = execute(a[3]); /* target string */ - t = getsval(x); - if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */ - pfa = (fa *) a[1]; /* regular expression */ - else { - y = execute(a[1]); - pfa = makedfa(getsval(y), 1); - tempfree(y); + const char *r, *s; + const char *start; + const char *noempty = NULL; /* empty match disallowed here */ + size_t m = 0; /* match count */ + size_t whichm; /* which match 
to select, 0 = global */ + int mtype; /* match type */ + + if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */ + pfa = (fa *) a[1]; + } else { + x = execute(a[1]); + pfa = makedfa(getsval(x), 1); + tempfree(x); } - y = execute(a[2]); /* replacement string */ - result = False; - if (pmatch(pfa, t)) { - sptr = t; - adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub"); - pb = buf; - while (sptr < patbeg) - *pb++ = *sptr++; - sptr = getsval(y); - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; + + x = execute(a[2]); /* replacement string */ + repl = tostring(getsval(x)); + tempfree(x); + + switch (subop) { + case SUB: + whichm = 1; + x = execute(a[3]); /* source string */ + break; + case GSUB: + whichm = 0; + x = execute(a[3]); /* source string */ + break; + default: + FATAL("dosub: unrecognized subop: %d", subop); + } + + start = getsval(x); + while (pmatch(pfa, start)) { + if (buf == NULL) { + if ((pb = buf = malloc(bufsz)) == NULL) + FATAL("out of memory in dosub"); + tempstat = pfa->initstat; + pfa->initstat = 2; } - *pb = '\0'; - if (pb > buf + bufsz) - FATAL("sub result1 %.30s too big; can't happen", buf); - sptr = patbeg + patlen; - if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) { - adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub"); - while ((*pb++ = *sptr++) != '\0') - continue; + + /* match types */ + #define MT_IGNORE 0 /* unselected or invalid */ + #define MT_INSERT 1 /* selected, empty */ + #define MT_REPLACE 2 /* selected, not empty */ + + /* an empty match just after replacement is invalid */ + + if (patbeg == noempty && patlen == 0) { + mtype = MT_IGNORE; /* invalid, not counted */ + } else if (whichm == ++m || whichm == 0) { + mtype = patlen ? 
MT_REPLACE : MT_INSERT; + } else { + mtype = MT_IGNORE; /* unselected, but counted */ } - if (pb > buf + bufsz) - FATAL("sub result2 %.30s too big; can't happen", buf); - setsval(x, buf); /* BUG: should be able to avoid copy */ - result = True; - } - tempfree(x); - tempfree(y); - free(buf); - return result; -} -Cell *gsub(Node **a, int nnn) /* global substitute */ -{ - Cell *x, *y; - char *rptr, *pb; - const char *q, *t, *sptr; - char *buf; - fa *pfa; - int mflag, tempstat, num; - int bufsz = recsize; - int charlen = 0; + /* leading text: */ + if (patbeg > start) { + adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start), + recsize, &pb, "dosub"); + s = start; + while (s < patbeg) + *pb++ = *s++; + } - if ((buf = (char *) malloc(bufsz)) == NULL) - FATAL("out of memory in gsub"); - mflag = 0; /* if mflag == 0, can replace empty string */ - num = 0; - x = execute(a[3]); /* target string */ - t = getsval(x); - if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */ - pfa = (fa *) a[1]; /* regular expression */ - else { - y = execute(a[1]); - pfa = makedfa(getsval(y), 1); - tempfree(y); - } - y = execute(a[2]); /* replacement string */ - if (pmatch(pfa, t)) { - tempstat = pfa->initstat; - pfa->initstat = 2; - pb = buf; - rptr = getsval(y); - do { - if (patlen == 0 && *patbeg != '\0') { /* matched empty string */ - if (mflag == 0) { /* can replace empty */ - num++; - sptr = rptr; - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; - } - } - if (*t == '\0') /* at end */ - goto done; - adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub"); - charlen = u8_nextlen(t); - while (charlen-- > 0) - *pb++ = *t++; - if (pb > buf + bufsz) /* BUG: not sure of this test */ - FATAL("gsub result0 %.30s too big; can't happen", 
buf); - mflag = 0; + if (mtype == MT_IGNORE) + goto matching_text; /* skip replacement text */ + + r = repl; + while (*r != 0) { + adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub"); + if (*r == '\\') { + backsub(&pb, &r); + } else if (*r == '&') { + r++; + adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, + &pb, "dosub"); + for (s = patbeg; s < patbeg+patlen; ) + *pb++ = *s++; + } else { + *pb++ = *r++; } - else { /* matched nonempty string */ - num++; - sptr = t; - adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub"); - while (sptr < patbeg) - *pb++ = *sptr++; - sptr = rptr; - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; - } - t = patbeg + patlen; - if (patlen == 0 || *t == '\0' || *(t-1) == '\0') - goto done; - if (pb > buf + bufsz) - FATAL("gsub result1 %.30s too big; can't happen", buf); - mflag = 1; - } - } while (pmatch(pfa,t)); - sptr = t; - adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub"); - while ((*pb++ = *sptr++) != '\0') - continue; - done: if (pb < buf + bufsz) - *pb = '\0'; - else if (*(pb-1) != '\0') - FATAL("gsub result2 %.30s truncated; can't happen", buf); - setsval(x, buf); /* BUG: should be able to avoid copy + free */ + } + +matching_text: + if (mtype == MT_REPLACE || *patbeg == '\0') + goto next_search; /* skip matching text */ + + if (patlen == 0) + patlen = u8_nextlen(patbeg); + adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub"); + s = patbeg; + while (s < patbeg + patlen) + *pb++ = *s++; + +next_search: + start = patbeg + patlen; + if (m == whichm || *patbeg == '\0') + break; + if (mtype == MT_REPLACE) + noempty = start; + + #undef MT_IGNORE + #undef MT_INSERT + #undef MT_REPLACE + } + + xfree(repl); + + if (buf != NULL) { pfa->initstat = 
tempstat; + + /* trailing text */ + adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub"); + while ((*pb++ = *start++) != '\0') + ; + + setsval(x, buf); + free(buf); } + tempfree(x); - tempfree(y); x = gettemp(); x->tval = NUM; - x->fval = num; - free(buf); - return(x); + x->fval = m; + return x; } void backsub(char **pb_ptr, const char **sptr_ptr) /* handle \\& variations */ -- cgit v1.2.3 From 6e222fe5c0df10f406dd61a9ae587a1d86bd5d3c Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Sat, 11 Nov 2023 20:28:43 +0200 Subject: Fix two incorrect test cases. --- testdir/T.csv | 1 - testdir/T.flags | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/testdir/T.csv b/testdir/T.csv index 10da1ea..79c1510 100755 --- a/testdir/T.csv +++ b/testdir/T.csv @@ -77,5 +77,4 @@ a''b [a''b] a, [a][] "", [][] , [][] -a"b [a"b] !!!! diff --git a/testdir/T.flags b/testdir/T.flags index 33d7c8d..17ce561 100755 --- a/testdir/T.flags +++ b/testdir/T.flags @@ -20,5 +20,6 @@ grep 'unknown option' foo >/dev/null || echo 'T.flags: bad unknown option' $awk -F >foo 2>&1 grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator' -$awk -F '' >foo 2>&1 -grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator' +### Awk is now like gawk and splits into separate characters if FS = "" +# $awk -F '' >foo 2>&1 +# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator' -- cgit v1.2.3 From 9acc5109412df07a977cdedb4537ebd90a38da9f Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Wed, 15 Nov 2023 15:24:08 -0500 Subject: man page update for 8-bit locales. thanks arnold --- awk.1 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/awk.1 b/awk.1 index 40ff0d3..ef40a01 100644 --- a/awk.1 +++ b/awk.1 @@ -586,6 +586,9 @@ the syntax is worse. .PP Input is expected to be UTF-8 encoded. Other multibyte character sets are not handled. 
+However, in eight-bit locales, +.I awk +treats each input byte as a separate character. .SH UNUSUAL FLOATING-POINT VALUES .I Awk was designed before IEEE 754 arithmetic defined Not-A-Number (NaN) -- cgit v1.2.3 From e3c63b9e62e404ad7aa2ac3780503e0a6ace6d2f Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Wed, 15 Nov 2023 15:25:20 -0500 Subject: replace gcc with cc --- bugs-fixed/REGRESS | 2 +- makefile | 8 ++++---- testdir/Compare.tt | 2 +- testdir/REGRESS | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bugs-fixed/REGRESS b/bugs-fixed/REGRESS index 0716003..98d578a 100755 --- a/bugs-fixed/REGRESS +++ b/bugs-fixed/REGRESS @@ -1,4 +1,4 @@ -#! /bin/bash +#! /bin/sh if [ ! -f ../a.out ] then diff --git a/makefile b/makefile index df966ef..b47a8af 100644 --- a/makefile +++ b/makefile @@ -28,10 +28,10 @@ CFLAGS = CFLAGS = -O2 # compiler options -#CC = gcc -Wall -g -Wwrite-strings -#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing -#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov -HOSTCC = gcc -g -Wall -pedantic -Wcast-qual +#CC = cc -Wall -g -Wwrite-strings +#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing +#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov +HOSTCC = cc -g -Wall -pedantic -Wcast-qual CC = $(HOSTCC) # change this is cross-compiling. # By fiat, to make our lives easier, yacc is now defined to be bison. 
diff --git a/testdir/Compare.tt b/testdir/Compare.tt index ca828d2..4b297d7 100755 --- a/testdir/Compare.tt +++ b/testdir/Compare.tt @@ -4,7 +4,7 @@ oldawk=${oldawk-awk} awk=${awk-../a.out} echo compiling time.c -gcc time.c -o time +cc time.c -o time time=./time echo time command = $time diff --git a/testdir/REGRESS b/testdir/REGRESS index 5c3667f..b54ce3f 100755 --- a/testdir/REGRESS +++ b/testdir/REGRESS @@ -1,7 +1,7 @@ #!/bin/sh uname -a -gcc echo.c -o echo && echo echo compiled +cc echo.c -o echo && echo echo compiled oldawk=${oldawk-awk} awk=${awk-../a.out} -- cgit v1.2.3 From 904af5677d1886c947d0cd78fa63f739a8c9a394 Mon Sep 17 00:00:00 2001 From: Miguel Pineiro Jr Date: Wed, 15 Nov 2023 12:52:20 -0500 Subject: Fix fnematch utf8 support --- b.c | 148 ++++++++++++++++++++++++++------------------------------------------ 1 file changed, 56 insertions(+), 92 deletions(-) diff --git a/b.c b/b.c index aa07d59..1669bcc 100644 --- a/b.c +++ b/b.c @@ -759,59 +759,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */ #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long -// Read one rune at a time from the given FILE*. Return both -// the bytes and the actual rune. 
- -struct runedata { - int rune; - size_t len; - char bytes[6]; -}; - -struct runedata getrune(FILE *fp) -{ - struct runedata result; - int c, next; - - memset(&result, 0, sizeof(result)); - - c = getc(fp); - if (c == EOF) - return result; // result.rune == 0 --> EOF - else if (c < 128 || awk_mb_cur_max == 1) { - result.bytes[0] = c; - result.len = 1; - result.rune = c; - - return result; - } - - // need to get bytes and fill things in - result.bytes[0] = c; - result.len = 1; - - next = 1; - for (int i = 1; i < MAX_UTF_BYTES; i++) { - c = getc(fp); - if (c == EOF) - break; - result.bytes[next++] = c; - result.len++; - } - - // put back any extra input bytes - int actual_len = u8_nextlen(result.bytes); - while (result.len > actual_len) { - ungetc(result.bytes[--result.len], fp); - } - - result.bytes[result.len] = '\0'; - (void) u8_rune(& result.rune, (uschar *) result.bytes); - - return result; -} - - /* * NAME * fnematch @@ -829,58 +776,76 @@ struct runedata getrune(FILE *fp) bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) { - char *buf = *pbuf; + char *i, *j, *k, *buf = *pbuf; int bufsize = *pbufsize; - int i, j, k, ns, s; - struct runedata r; + int c, n, ns, s; s = pfa->initstat; patlen = 0; /* - * All indices relative to buf. - * i <= j <= k <= bufsize + * buf <= i <= j <= k <= buf+bufsize * - * i: origin of active substring (first byte of first character) - * j: current character (last byte of current character) - * k: destination of next getc() + * i: origin of active substring + * j: current character + * k: destination of the next getc */ - i = -1, k = 0; - do { - j = i++; - do { - r = getrune(f); - if ((++j + r.len) >= k) { - if (k >= bufsize) - if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) - FATAL("stream '%.30s...' 
too long", buf); - } - memcpy(buf + k, r.bytes, r.len); - j += r.len - 1; // incremented next time around the loop - k += r.len; - if ((ns = get_gototab(pfa, s, r.rune)) != 0) - s = ns; - else - s = cgoto(pfa, s, r.rune); + i = j = k = buf; - if (pfa->out[s]) { /* final state */ - patlen = j - i + 1; - if (r.rune == 0) /* don't count $ */ - patlen--; + do { + /* + * Call u8_rune with at least MAX_UTF_BYTES ahead in + * the buffer until EOF interferes. + */ + if (k - j < MAX_UTF_BYTES) { + if (k + MAX_UTF_BYTES > buf + bufsize) { + adjbuf((char **) &buf, &bufsize, + bufsize + MAX_UTF_BYTES, + quantum, 0, "fnematch"); + } + for (n = MAX_UTF_BYTES ; n > 0; n--) { + *k++ = (c = getc(f)) != EOF ? c : 0; + if (c == EOF) { + if (ferror(f)) + FATAL("fnematch: getc error"); + break; + } } - } while (buf[j] && s != 1); + } + + j += u8_rune(&c, (uschar *)j); + + if ((ns = get_gototab(pfa, s, c)) != 0) + s = ns; + else + s = cgoto(pfa, s, c); + + if (pfa->out[s]) { /* final state */ + patbeg = i; + patlen = j - i; + if (c == 0) /* don't count $ */ + patlen--; + } + + if (c && s != 1) + continue; /* origin i still viable, next j */ + if (patlen) + break; /* best match found */ + + /* no match at origin i, next i and start over */ + i += u8_rune(&c, (uschar *)i); + if (c == 0) + break; /* no match */ + j = i; s = 2; - if (r.len > 1) - i += r.len - 1; // i incremented around the loop - } while (buf[i] && !patlen); + } while (1); /* adjbuf() may have relocated a resized buffer. Inform the world. */ *pbuf = buf; *pbufsize = bufsize; if (patlen) { - patbeg = (char *) buf + i; /* * Under no circumstances is the last character fed to * the automaton part of the match. It is EOF's nullbyte, @@ -893,11 +858,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) * terminate the buffer. 
*/ do - for (int ii = r.len; ii > 0; ii--) - if (buf[--k] && ungetc(buf[k], f) == EOF) - FATAL("unable to ungetc '%c'", buf[k]); - while (k > i + patlen); - buf[k] = '\0'; + if (*--k && ungetc(*k, f) == EOF) + FATAL("unable to ungetc '%c'", *k); + while (k > patbeg + patlen); + *k = '\0'; return true; } else -- cgit v1.2.3 From f8fb1503a2137625dae68844404456e09e996cee Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Thu, 16 Nov 2023 14:45:58 -0500 Subject: updated for the latest fixes --- FIXES | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/FIXES b/FIXES index a13ca50..bc59f69 100644 --- a/FIXES +++ b/FIXES @@ -25,6 +25,12 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Nov 15, 2023 + Man page edit, regression test fixes. thanks to Arnold Robbins + consolidation of sub and gsub into dosub, removing duplicate + code. thanks to Miguel Pineiro Jr. + gcc replaced with cc everywhere. + Oct 30, 2023: multiple fixes and a minor code cleanup. disabled utf-8 for non-multibyte locales, such as C or POSIX. -- cgit v1.2.3 From 9e254e503f844e122870e9488db3d7b0233e554c Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Thu, 16 Nov 2023 14:56:49 -0500 Subject: adjusted version date. --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 3a205c8..5f07419 100644 --- a/main.c +++ b/main.c @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
****************************************************************/ -const char *version = "version 20231030"; +const char *version = "version 20231116"; #define DEBUG #include -- cgit v1.2.3 From 5e82404389a0486bad6221ecd00e714fa3e02630 Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Mon, 20 Nov 2023 10:47:31 -0500 Subject: updated for the latest fixes --- FIXES | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/FIXES b/FIXES index bc59f69..5d2b459 100644 --- a/FIXES +++ b/FIXES @@ -25,10 +25,16 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Nov 20, 2023 + rewrite of fnematch to fix a number of issues, including + extraneous output, out-of-bounds access, number of bytes + to push back after a failed match etc. + thanks to Miguel Pineiro Jr. + Nov 15, 2023 Man page edit, regression test fixes. thanks to Arnold Robbins consolidation of sub and gsub into dosub, removing duplicate - code. thanks to Miguel Pineiro Jr. + code. thanks to Miguel Pineiro Jr. gcc replaced with cc everywhere. Oct 30, 2023: -- cgit v1.2.3 From ad4249ec70c04690a797a8d8310b1d4ee69a357b Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Mon, 20 Nov 2023 10:48:24 -0500 Subject: adjusted version date. --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 5f07419..4f2d78a 100644 --- a/main.c +++ b/main.c @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20231116"; +const char *version = "version 20231120"; #define DEBUG #include -- cgit v1.2.3 From ad444edf7b9d72d9c1025764cb76079b6cc84661 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Thu, 23 Nov 2023 19:13:04 +0200 Subject: Integrate fix from Issue #169. 
--- lex.c | 14 +++++++++----- testdir/T.misc | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/lex.c b/lex.c index 675c116..0473a33 100644 --- a/lex.c +++ b/lex.c @@ -421,8 +421,12 @@ int string(void) { int i; + if (!isxdigit(peek())) { + unput(c); + break; + } n = 0; - for (i = 1; i <= 2; i++) { + for (i = 0; i < 2; i++) { c = input(); if (c == 0) break; @@ -433,13 +437,13 @@ int string(void) n += (c - '0'); else n += 10 + (c - 'a'); - } else + } else { + unput(c); break; + } } - if (n) + if (i) *bp++ = n; - else - unput(c); break; } diff --git a/testdir/T.misc b/testdir/T.misc index 1e5c3c5..a2c0fb8 100755 --- a/testdir/T.misc +++ b/testdir/T.misc @@ -510,3 +510,17 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error' echo 1b >foo1 echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2 cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered' + ++# Check handling of octal (\OOO) and hex (\xHH) esc. seqs. in strings. ++echo 'hello888 ++hello ++hello ++helloxGOO ++hello ++0A' > foo1 ++$awk 'BEGIN { print "hello\888" }' > foo2 ++$awk 'BEGIN { print "hello\x000A" }' >> foo2 ++$awk 'BEGIN { printf "hello\x0A" }' >> foo2 ++$awk 'BEGIN { print "hello\xGOO" }' >> foo2 ++$awk 'BEGIN { print "hello\x0A0A" }' >> foo2 ++cmp -s foo1 foo2 || echo '�BAD: T.misc escape sequences in strings mishandled' -- cgit v1.2.3 From b84620a7b5c89e84ec76821ba3fd31d70b29ee7b Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Thu, 23 Nov 2023 19:20:45 +0200 Subject: Fix issue #147. Update FIXES and main.c version. --- FIXES | 11 ++++++++--- main.c | 2 +- run.c | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/FIXES b/FIXES index 5d2b459..e84bd44 100644 --- a/FIXES +++ b/FIXES @@ -25,13 +25,19 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. 
-Nov 20, 2023 +Nov 23, 2023: + Fix Issue #169, related to escape sequences in strings. + Thanks to Github user rajeevvp. + Fix Issue #147, reported by Github user drawkula, and fixed + by Miguel Pineiro Jr. + +Nov 20, 2023: rewrite of fnematch to fix a number of issues, including extraneous output, out-of-bounds access, number of bytes to push back after a failed match etc. thanks to Miguel Pineiro Jr. -Nov 15, 2023 +Nov 15, 2023: Man page edit, regression test fixes. thanks to Arnold Robbins consolidation of sub and gsub into dosub, removing duplicate code. thanks to Miguel Pineiro Jr. @@ -44,7 +50,6 @@ Oct 30, 2023: systems. also fixed an out-of-bounds read for empty CCL. fixed a buffer overflow in substr with utf-8 strings. many thanks to Todd C Miller. - Sep 24, 2023: fnematch and getrune have been overhauled to solve issues around diff --git a/main.c b/main.c index 4f2d78a..a5f49c6 100644 --- a/main.c +++ b/main.c @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20231120"; +const char *version = "version 20231123"; #define DEBUG #include diff --git a/run.c b/run.c index e78cae9..7462c38 100644 --- a/run.c +++ b/run.c @@ -1540,8 +1540,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. 
*/ if (x == y && !(x->tval & (FLD|REC)) && x != nfloc) ; /* self-assignment: leave alone unless it's a field or NF */ else if ((y->tval & (STR|NUM)) == (STR|NUM)) { + yf = getfval(y); setsval(x, getsval(y)); - x->fval = getfval(y); + x->fval = yf; x->tval |= NUM; } else if (isstr(y)) -- cgit v1.2.3 From 0da7e5331dce8d2c74b5de473a0659d4e2fb30a2 Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Thu, 23 Nov 2023 14:14:44 -0500 Subject: cleanup of the new test for issue #169 --- testdir/T.misc | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/testdir/T.misc b/testdir/T.misc index a2c0fb8..b8ed3c1 100755 --- a/testdir/T.misc +++ b/testdir/T.misc @@ -511,16 +511,16 @@ echo 1b >foo1 echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2 cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered' -+# Check handling of octal (\OOO) and hex (\xHH) esc. seqs. in strings. -+echo 'hello888 -+hello -+hello -+helloxGOO -+hello -+0A' > foo1 -+$awk 'BEGIN { print "hello\888" }' > foo2 -+$awk 'BEGIN { print "hello\x000A" }' >> foo2 -+$awk 'BEGIN { printf "hello\x0A" }' >> foo2 -+$awk 'BEGIN { print "hello\xGOO" }' >> foo2 -+$awk 'BEGIN { print "hello\x0A0A" }' >> foo2 -+cmp -s foo1 foo2 || echo '�BAD: T.misc escape sequences in strings mishandled' +# Check handling of octal \OOO and hex \xHH esc. seqs. in strings. +echo 'hello888 +hello +hello +helloxGOO +hello +0A' > foo1 +$awk 'BEGIN { print "hello\888" }' > foo2 +$awk 'BEGIN { print "hello\x000A" }' >> foo2 +$awk 'BEGIN { printf "hello\x0A" }' >> foo2 +$awk 'BEGIN { print "hello\xGOO" }' >> foo2 +$awk 'BEGIN { print "hello\x0A0A" }' >> foo2 +cmp -s foo1 foo2 || echo '�BAD: T.misc escape sequences in strings mishandled' -- cgit v1.2.3 From 1c424b1b4d5931f2a4b87b6f0f75827bc53ff9d7 Mon Sep 17 00:00:00 2001 From: "Arnold D. Robbins" Date: Fri, 24 Nov 2023 10:49:41 +0200 Subject: Initialize realloced memory to zero. 
--- b.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/b.c b/b.c index 4bc43bf..881c052 100644 --- a/b.c +++ b/b.c @@ -602,7 +602,12 @@ static void resize_gototab(fa *f, int state) gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); if (p == NULL) overflo(__func__); - f->gototab[state].allocated = new_size; + + // need to initialize the new memory to zero + size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size + memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out + + f->gototab[state].allocated = new_size; // update gototab info f->gototab[state].entries = p; } -- cgit v1.2.3 From 08544f61c0b92b0f4ab0963369b1b2d1c8a82db2 Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Fri, 24 Nov 2023 08:33:38 -0500 Subject: fix issue #199: gototab improvements --- FIXES | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/FIXES b/FIXES index e84bd44..52f49e3 100644 --- a/FIXES +++ b/FIXES @@ -25,6 +25,11 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Nov 24, 2023: + Fix issue #199: gototab improvements to dynamically resize the + table, qsort and bsearch to improve the lookup speed as the + table gets larger for multibyte input. thanks to Arnold Robbins. + Nov 23, 2023: Fix Issue #169, related to escape sequences in strings. Thanks to Github user rajeevvp. -- cgit v1.2.3 From fbd1d5b712e27a9bb527e39ed6e9bf3b9afbb1df Mon Sep 17 00:00:00 2001 From: ozan yigit Date: Fri, 24 Nov 2023 08:34:47 -0500 Subject: adjust version date. --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index a5f49c6..c478e32 100644 --- a/main.c +++ b/main.c @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
****************************************************************/ -const char *version = "version 20231123"; +const char *version = "version 20231124"; #define DEBUG #include -- cgit v1.2.3