1 files changed, 1086 insertions, 0 deletions
diff --git a/maint/ucptest.c b/maint/ucptest.c
new file mode 100644
index 00000000..6faead30
--- /dev/null
+++ b/maint/ucptest.c
@@ -0,0 +1,1086 @@
+/***************************************************
+* A program for testing the Unicode property table *
+***************************************************/
+
+/* Copyright (c) University of Cambridge 2008-2022 */
+
+/* Compile thus:
+
+   gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
+     ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
+
+   Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
+   support in pcre2test.
+*/
+
+/* This is a hacked-up program for testing the Unicode properties tables of
+PCRE2. It can also be used for finding characters with certain properties. I
+wrote it to help with debugging, and have added things that I found useful, in
+a rather haphazard way. The code has never been seriously tidied or checked for
+robustness, but it shouldn't now give compiler warnings.
+
+There is only one option: "-s". If given, it applies only to the "findprop"
+command. It causes the UTF-8 sequence of bytes that encode the character to be
+output between angle brackets at the end of the line. On a UTF-8 terminal, this
+will show the appropriate graphic for the code point.
+
+If the command has arguments, they are concatenated into a buffer, separated by
+spaces. If the first argument starts "U+" or consists entirely of hexadecimal
+digits, "findprop" is inserted at the start. The buffer is then processed as a
+single line file, after which the program exits. If there are no arguments, the
+program reads commands line by line on stdin and writes output to stdout. The
+return code is always zero.
+
+There are three commands:
+
+The command "findprop" must be followed by a space-separated list of Unicode
+code points as hex numbers, either without any prefix or starting with "U+", or
+as individual UTF-8 characters preceded by '+'. For example:
+
+  findprop U+1234 5Abc +?
+
+The output is one long line per character, listing Unicode properties that have
+values, followed by its other case or cases if one or more exist, followed by
+its Script Extension list if there is one. This list is in square brackets. A
+second list in square brackets gives all the Boolean properties of the
+character. The properties that come first are:
+
+  Bidi class          e.g. NSM (most common is L)
+  General type        e.g. Letter
+  Specific type       e.g. Upper case letter
+  Script              e.g. Medefaidrin
+  Grapheme break type e.g. Extend (most common is Other)
+
+Script names and Boolean property names are all in lower case, with underscores
+and hyphens removed, because that's how they are stored for "loose" matching.
+
+The command "find" must be followed by a list of property types and their
+values. The values are case-sensitive, except for bidi class. This finds
+characters that have those properties. If multiple properties are listed, they
+must all be matched. Currently supported:
+
+  script <name>    The character must have this script property. Only one
+                     such script may be given.
+  scriptx <name>   This script must be in the character's Script Extension
+                     property list. If this is used many times, all the given
+                     scripts must be present.
+  type <abbrev>    The character's specific type (e.g. Lu or Nd) must match.
+  gbreak <name>    The grapheme break property must match.
+  bidi <class>     The character's bidi class must match.
+  bool <name>      The character's Boolean property list must contain this
+                     property.
+
+If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
+Script Extensions and Boolean properties, there may be a mixture of positive
+and negative requirements. All must be satisfied.
+
+Sequences of two or more characters are shown as ranges, for example
+U+0041..U+004A. No more than 100 lines are output. If there are more
+characters, the list ends with ...
+
+The command "list" must be followed by one of property names script, bool,
+type, gbreak or bidi. The defined values for that property are listed. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "../src/config.h"
+#endif
+
+#ifndef SUPPORT_UNICODE
+#define SUPPORT_UNICODE
+#endif
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../src/pcre2_internal.h"
+#include "../src/pcre2_ucp.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+#if defined(SUPPORT_LIBREADLINE)
+#include <readline/readline.h>
+#include <readline/history.h>
+#else
+#if defined(HAVE_EDITLINE_READLINE_H)
+#include <editline/readline.h>
+#else
+#include <readline/readline.h>
+#ifdef RL_VERSION_MAJOR
+#include <readline/history.h>
+#endif
+#endif
+#endif
+#endif
+
+
+/* -------------------------------------------------------------------*/
+
+/* Shorthand casts, written as a prefix to the expression being cast (for
+example "CS name" or US"text"). They convert between the unsigned char strings
+that this program works with and the plain char strings that the C library
+functions take. */
+
+#define CS   (char *)
+#define CCS  (const char *)
+#define CSS  (char **)
+#define US   (unsigned char *)
+#define CUS  (const unsigned char *)
+#define USS  (unsigned char **)
+
+/* -------------------------------------------------------------------*/
+
+/* Set TRUE by main() when the "-s" option is given. When it is set, print_prop()
+appends the UTF-8 encoding of the character to lines printed for "findprop". */
+
+static BOOL show_character = FALSE;
+
+/* Specific character types. The entries come in pairs: the two-letter
+abbreviation followed by a description. The "list type" command prints each
+pair. find_chars() looks up an abbreviation here and uses half of its index as
+the type value that is compared with UCD_CHARTYPE(), so the order of the pairs
+has to follow the numerical order of the ucp_xx type values (presumably those
+in pcre2_ucp.h - confirm if this table is ever changed). */
+
+static const unsigned char *type_names[] = {
+  US"Cc", US"Control",
+  US"Cf", US"Format",
+  US"Cn", US"Unassigned",
+  US"Co", US"Private use",
+  US"Cs", US"Surrogate",
+  US"Ll", US"Lower case letter",
+  US"Lm", US"Modifier letter",
+  US"Lo", US"Other letter",
+  US"Lt", US"Title case letter",
+  US"Lu", US"Upper case letter",
+  US"Mc", US"Spacing mark",
+  US"Me", US"Enclosing mark",
+  US"Mn", US"Non-spacing mark",
+  US"Nd", US"Decimal number",
+  US"Nl", US"Letter number",
+  US"No", US"Other number",
+  US"Pc", US"Connector punctuation",
+  US"Pd", US"Dash punctuation",
+  US"Pe", US"Close punctuation",
+  US"Pf", US"Final punctuation",
+  US"Pi", US"Initial punctuation",
+  US"Po", US"Other punctuation",
+  US"Ps", US"Open punctuation",
+  US"Sc", US"Currency symbol",
+  US"Sk", US"Modifier symbol",
+  US"Sm", US"Mathematical symbol",
+  US"So", US"Other symbol",
+  US"Zl", US"Line separator",
+  US"Zp", US"Paragraph separator",
+  US"Zs", US"Space separator"
+};
+
+/* Grapheme break property values. The entries come in pairs: the name followed
+by a description, which is an empty string when there is nothing to add to the
+name (the "list gbreak" command then prints the name alone). find_chars() uses
+half of the index of a matched name as the value that is compared with
+UCD_GRAPHBREAK(), so the order of the pairs has to follow the numerical order
+of the ucp_gbXX values (presumably those in pcre2_ucp.h - confirm if this
+table is ever changed). */
+
+static const unsigned char *gb_names[] = {
+  US"CR",                    US"carriage return",
+  US"LF",                    US"linefeed",
+  US"Control",               US"",
+  US"Extend",                US"",
+  US"Prepend",               US"",
+  US"SpacingMark",           US"",
+  US"L",                     US"Hangul syllable type L",
+  US"V",                     US"Hangul syllable type V",
+  US"T",                     US"Hangul syllable type T",
+  US"LV",                    US"Hangul syllable type LV",
+  US"LVT",                   US"Hangul syllable type LVT",
+  US"Regional_Indicator",    US"",
+  US"Other",                 US"",
+  US"ZWJ",                   US"zero width joiner",
+  US"Extended_Pictographic", US""
+};
+
+/* Bidi classes. The entries come in pairs: the abbreviation followed by a
+description. The "list bidi" command prints each pair. find_chars() uses half
+of the index of a matched abbreviation as the value that is compared with
+UCD_BIDICLASS(), so the order of the pairs has to follow the numerical order
+of the ucp_bidiXX values (presumably those in pcre2_ucp.h - confirm if this
+table is ever changed). */
+
+static const unsigned char *bd_names[] = {
+  US"AL",   US"Arabic letter",
+  US"AN",   US"Arabic number",
+  US"B",    US"Paragraph separator",
+  US"BN",   US"Boundary neutral",
+  US"CS",   US"Common separator",
+  US"EN",   US"European number",
+  US"ES",   US"European separator",
+  US"ET",   US"European terminator",
+  US"FSI",  US"First string isolate",
+  US"L",    US"Left-to-right",
+  US"LRE",  US"Left-to-right embedding",
+  US"LRI",  US"Left-to-right isolate",
+  US"LRO",  US"Left-to-right override",
+  US"NSM",  US"Non-spacing mark",
+  US"ON",   US"Other neutral",
+  US"PDF",  US"Pop directional format",
+  US"PDI",  US"Pop directional isolate",
+  US"R",    US"Right-to-left",
+  US"RLE",  US"Right-to-left embedding",
+  US"RLI",  US"Right-to-left isolate",
+  US"RLO",  US"Right-to-left override",
+  US"S",    US"Segment separator",
+  US"WS",   US"White space"
+};
+
+/* Tables used by ord2utf8(). Entry i of utf8_table1 is the largest value that
+can be encoded in i+1 bytes; ord2utf8() picks the first entry that is not less
+than the value being encoded. */
+
+static const unsigned int utf8_table1[] = {
+  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
+
+/* Entry i of utf8_table2 holds the marker bits that ord2utf8() ORs into the
+first byte of an encoding that is i+1 bytes long. */
+
+static const int utf8_table2[] = {
+  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
+the pointer.
+
+On entry, c must already hold the first (lead) byte of the character and eptr
+must point to the byte that follows it. The bits 0x20, 0x10, 0x08 and 0x04 of
+the lead byte are tested in turn to choose between sequences with 1, 2, 3, 4
+or 5 following bytes. On exit, c holds the decoded value and eptr has been
+moved past the bytes that were used.
+
+The following bytes are not validated: only their low-order 6 bits are used,
+and there is no check for the end of the string, so the caller must ensure
+that enough bytes are present. Both arguments are evaluated more than once and
+are assigned to, so they must be plain variables without side effects. The
+expansion is a brace-enclosed block rather than an expression. */
+
+#define GETUTF8INC(c, eptr) \
+    { \
+    if ((c & 0x20u) == 0) \
+      c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
+    else if ((c & 0x10u) == 0) \
+      { \
+      c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
+      eptr += 2; \
+      } \
+    else if ((c & 0x08u) == 0) \
+      { \
+      c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
+          ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
+      eptr += 3; \
+      } \
+    else if ((c & 0x04u) == 0) \
+      { \
+      c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
+          ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
+          (eptr[3] & 0x3fu); \
+      eptr += 4; \
+      } \
+    else \
+      { \
+      c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
+          ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
+          ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
+      eptr += 5; \
+      } \
+    }
+
+
+
+/*************************************************
+*       Convert character value to UTF-8         *
+*************************************************/
+
+/* Encode a character value in the range 0 - 0x7fffffff as a UTF-8 character
+of 1 to 6 bytes. The number of bytes is found from utf8_table1; the bytes after
+the first are filled in from the last one backwards, each taking the next 6
+low-order bits of the value, and what is left goes into the first byte along
+with the marker bits from utf8_table2. No terminating zero is written.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 6 bytes long
+
+Returns:     number of bytes placed in the buffer
+             0 if input code point is too big
+*/
+
+static size_t
+ord2utf8(unsigned int cvalue, unsigned char *buffer)
+{
+const size_t nranges = sizeof(utf8_table1)/sizeof(int);
+size_t extra = 0;
+unsigned char *p;
+
+/* Count how many bytes are needed in addition to the first one. */
+
+while (extra < nranges && cvalue > utf8_table1[extra]) extra++;
+if (extra >= nranges) return 0;
+
+/* Work backwards from the final byte to the one after the first. */
+
+for (p = buffer + extra; p > buffer; p--)
+  {
+  *p = 0x80 | (cvalue & 0x3f);
+  cvalue >>= 6;
+  }
+
+*p = utf8_table2[extra] | cvalue;
+return extra + 1;
+}
+
+
+
+/*************************************************
+*             Test for interaction               *
+*************************************************/
+
+/* Ask the C library whether the standard input is a terminal, using _isatty()
+and _fileno() when WIN32 is defined and isatty() and fileno() otherwise.
+
+Arguments:   none
+Returns:     the value returned by the library's isatty function
+*/
+
+static BOOL
+is_stdin_tty(void)
+{
+int fd;
+#if defined WIN32
+fd = _fileno(stdin);
+return _isatty(fd);
+#else
+fd = fileno(stdin);
+return isatty(fd);
+#endif
+}
+
+
+/*************************************************
+*            Get  name from ucp ident            *
+*************************************************/
+
+/* Find the name of a property value. The utt table can hold both a full name
+and an abbreviation for the same value, so the first two entries that match
+are collected. The longer of the two names is returned, except that when a
+script is wanted and a 3-character name is reached while scanning the
+candidates, that name is returned immediately (some scripts have 3-character
+names). When a script is requested, entries of type PT_SCX are accepted as well
+as those of type PT_SC. If this were not just a test program it might be worth
+making some kind of reverse index.
+
+Arguments:
+  prop       the property value to look for
+  type       the property type (one of the PT_ values)
+
+Returns:     pointer to the name within the utt names table
+             "??" if no entry matches
+*/
+
+static const char *
+get_propname(int prop, int type)
+{
+const int alt_type = (type == PT_SC)? PT_SCX : type;
+const char *candidates[2];
+const char *best = NULL;
+size_t ncand = 0;
+size_t best_len = 0;
+size_t k;
+
+/* Collect at most two names for this value. */
+
+for (k = 0; k < PRIV(utt_size) && ncand < 2; k++)
+  {
+  const ucp_type_table *entry = PRIV(utt) + k;
+  if (entry->type != type && entry->type != alt_type) continue;
+  if (entry->value != prop) continue;
+  candidates[ncand++] = PRIV(utt_names) + entry->name_offset;
+  }
+
+if (ncand == 0) return "??";
+
+/* Choose between the candidates. */
+
+for (k = 0; k < ncand; k++)
+  {
+  size_t n = strlen(candidates[k]);
+  if (n <= best_len) continue;
+  best = candidates[k];
+  if (n == 3 && type == PT_SC) break;
+  best_len = n;
+  }
+
+return best;
+}
+
+
+/*************************************************
+*      Print Unicode property info for a char    *
+*************************************************/
+
+/* Text for a general type (ucp_C, ucp_L, etc). The result is "??" for a value
+that is not listed here. */
+
+static const unsigned char *
+general_type_text(int type)
+{
+switch (type)
+  {
+  case ucp_C: return US"Control";
+  case ucp_L: return US"Letter";
+  case ucp_M: return US"Mark";
+  case ucp_N: return US"Number";
+  case ucp_P: return US"Punctuation";
+  case ucp_S: return US"Symbol";
+  case ucp_Z: return US"Separator";
+  default:    return US"??";
+  }
+}
+
+
+/* Text for a specific type (ucp_Cc, ucp_Lu, etc). The result is "??" for a
+value that is not listed here. */
+
+static const unsigned char *
+specific_type_text(int fulltype)
+{
+switch (fulltype)
+  {
+  case ucp_Cc: return US"Control";
+  case ucp_Cf: return US"Format";
+  case ucp_Cn: return US"Unassigned";
+  case ucp_Co: return US"Private use";
+  case ucp_Cs: return US"Surrogate";
+  case ucp_Ll: return US"Lower case letter";
+  case ucp_Lm: return US"Modifier letter";
+  case ucp_Lo: return US"Other letter";
+  case ucp_Lt: return US"Title case letter";
+  case ucp_Lu: return US"Upper case letter";
+  case ucp_Mc: return US"Spacing mark";
+  case ucp_Me: return US"Enclosing mark";
+  case ucp_Mn: return US"Non-spacing mark";
+  case ucp_Nd: return US"Decimal number";
+  case ucp_Nl: return US"Letter number";
+  case ucp_No: return US"Other number";
+  case ucp_Pc: return US"Connector punctuation";
+  case ucp_Pd: return US"Dash punctuation";
+  case ucp_Pe: return US"Close punctuation";
+  case ucp_Pf: return US"Final punctuation";
+  case ucp_Pi: return US"Initial punctuation";
+  case ucp_Po: return US"Other punctuation";
+  case ucp_Ps: return US"Open punctuation";
+  case ucp_Sc: return US"Currency symbol";
+  case ucp_Sk: return US"Modifier symbol";
+  case ucp_Sm: return US"Mathematical symbol";
+  case ucp_So: return US"Other symbol";
+  case ucp_Zl: return US"Line separator";
+  case ucp_Zp: return US"Paragraph separator";
+  case ucp_Zs: return US"Space separator";
+  default:     return US"??";
+  }
+}
+
+
+/* Text for a grapheme break property value. The result is "Unknown" for a
+value that is not listed here. */
+
+static const unsigned char *
+grapheme_break_text(int gbprop)
+{
+switch (gbprop)
+  {
+  case ucp_gbCR:                    return US"CR";
+  case ucp_gbLF:                    return US"LF";
+  case ucp_gbControl:               return US"Control";
+  case ucp_gbExtend:                return US"Extend";
+  case ucp_gbPrepend:               return US"Prepend";
+  case ucp_gbSpacingMark:           return US"SpacingMark";
+  case ucp_gbL:                     return US"Hangul syllable type L";
+  case ucp_gbV:                     return US"Hangul syllable type V";
+  case ucp_gbT:                     return US"Hangul syllable type T";
+  case ucp_gbLV:                    return US"Hangul syllable type LV";
+  case ucp_gbLVT:                   return US"Hangul syllable type LVT";
+  case ucp_gbRegional_Indicator:    return US"Regional Indicator";
+  case ucp_gbOther:                 return US"Other";
+  case ucp_gbZWJ:                   return US"Zero Width Joiner";
+  case ucp_gbExtended_Pictographic: return US"Extended Pictographic";
+  default:                          return US"Unknown";
+  }
+}
+
+
+/* Text for a bidi class, padded with spaces to 3 characters so that the
+output lines up. The result is "???" for a value that is not listed here. */
+
+static const unsigned char *
+bidi_class_text(int bidi)
+{
+switch (bidi)
+  {
+  case ucp_bidiAL:  return US"AL ";
+  case ucp_bidiAN:  return US"AN ";
+  case ucp_bidiB:   return US"B  ";
+  case ucp_bidiBN:  return US"BN ";
+  case ucp_bidiCS:  return US"CS ";
+  case ucp_bidiEN:  return US"EN ";
+  case ucp_bidiES:  return US"ES ";
+  case ucp_bidiET:  return US"ET ";
+  case ucp_bidiFSI: return US"FSI";
+  case ucp_bidiL:   return US"L  ";
+  case ucp_bidiLRE: return US"LRE";
+  case ucp_bidiLRI: return US"LRI";
+  case ucp_bidiLRO: return US"LRO";
+  case ucp_bidiNSM: return US"NSM";
+  case ucp_bidiON:  return US"ON ";
+  case ucp_bidiPDF: return US"PDF";
+  case ucp_bidiPDI: return US"PDI";
+  case ucp_bidiR:   return US"R  ";
+  case ucp_bidiRLE: return US"RLE";
+  case ucp_bidiRLI: return US"RLI";
+  case ucp_bidiRLO: return US"RLO";
+  case ucp_bidiS:   return US"S  ";
+  case ucp_bidiWS:  return US"WS ";
+  default:          return US"???";
+  }
+}
+
+
+/* Print, as a comma-separated list in square brackets preceded by ", ", the
+names of the properties whose bits are set in a bit map.
+
+Arguments:
+  map        the bit map
+  count      the number of bits to test, starting from bit 0
+  type       the property type passed to get_propname() for each set bit
+
+Returns:     nothing
+*/
+
+static void
+print_name_set(const uint32_t *map, int count, int type)
+{
+const char *sep = "";
+int i;
+
+printf(", [");
+for (i = 0; i < count; i++)
+  {
+  if (MAPBIT(map, i) == 0) continue;
+  printf("%s%s", sep, get_propname(i, type));
+  sep = ", ";
+  }
+printf("]");
+}
+
+
+/* Print one line describing a character: its code point, bidi class, general
+type, specific type, script, and grapheme break type. For a single character
+(as opposed to the end of a range) that has another case, the other case is
+added, followed by any further members of its caseless set. The Script
+Extensions list and the Boolean property list follow, each in square brackets,
+when the character has a non-zero index for them. Finally, if show_character
+is set and this is a single character, its UTF-8 encoding is added between
+">" and "<".
+
+Arguments:
+  c            the code point
+  is_just_one  TRUE for an individual character (the "findprop" command);
+               FALSE when called from find_chars()
+
+Returns:       nothing
+*/
+
+static void
+print_prop(unsigned int c, BOOL is_just_one)
+{
+const unsigned int othercase = UCD_OTHERCASE(c);
+const int caseset = UCD_CASESET(c);
+const int scriptx = UCD_SCRIPTX(c);
+const int bprops = UCD_BPROPS(c);
+
+printf("U+%04X %s %s: %s, %s, %s", c,
+  bidi_class_text(UCD_BIDICLASS(c)),
+  general_type_text(UCD_CATEGORY(c)),
+  specific_type_text(UCD_CHARTYPE(c)),
+  get_propname(UCD_SCRIPT(c), PT_SC),
+  grapheme_break_text(UCD_GRAPHBREAK(c)));
+
+if (is_just_one && othercase != c)
+  {
+  printf(", U+%04X", othercase);
+  if (caseset != 0)
+    {
+    const uint32_t *member;
+    for (member = PRIV(ucd_caseless_sets) + caseset;
+         *member < NOTACHAR;
+         member++)
+      {
+      unsigned int d = *member;
+      if (d == othercase || d == c) continue;   /* Already shown */
+      printf(", U+%04X", d);
+      }
+    }
+  }
+
+if (scriptx != 0)
+  print_name_set(PRIV(ucd_script_sets) + scriptx, ucp_Unknown, PT_SC);
+
+if (bprops != 0)
+  print_name_set(PRIV(ucd_boolprop_sets) +
+    bprops * ucd_boolprop_sets_item_size, ucp_Bprop_Count, PT_BOOL);
+
+if (show_character && is_just_one)
+  {
+  unsigned char utf8[8];
+  size_t utf8_len = ord2utf8(c, utf8);
+  printf(", >%.*s<", (int)utf8_len, utf8);
+  }
+
+printf("\n");
+}
+
+
+
+/*************************************************
+*   Find character(s) with given property/ies    *
+*************************************************/
+
+static void
+find_chars(unsigned char *s)
+{
+unsigned char name[128];
+unsigned char value[128];
+unsigned char *t;
+unsigned int count= 0;
+int scriptx_list[128];
+unsigned int scriptx_count = 0;
+int bprop_list[128];
+unsigned int bprop_count = 0;
+uint32_t i, c;
+int script = -1;
+int type = -1;
+int gbreak = -1;
+int bidiclass = -1;
+BOOL script_not = FALSE;
+BOOL type_not = FALSE;
+BOOL gbreak_not = FALSE;
+BOOL bidiclass_not = FALSE;
+BOOL hadrange = FALSE;
+const ucd_record *ucd, *next_ucd;
+const char *pad = "        ";
+
+while (*s != 0)
+  {
+  unsigned int offset = 0;
+  BOOL scriptx_not = FALSE;
+
+  for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
+  *t = 0;
+  while (isspace(*s)) s++;
+
+  for (t = value; *s != 0 && !isspace(*s); s++) 
+    {
+    if (*s != '_' && *s != '-') *t++ = *s;
+    } 
+  *t = 0;
+  while (isspace(*s)) s++;
+
+  if (strcmp(CS name, "script") == 0 ||
+      strcmp(CS name, "scriptx") == 0)
+    {
+    for (t = value; *t != 0; t++) *t = tolower(*t);
+ 
+    if (value[0] == '!')
+      {
+      if (name[6] == 'x') scriptx_not = TRUE;
+        else script_not = TRUE;
+      offset = 1;
+      }
+
+    for (i = 0; i < PRIV(utt_size); i++)
+      {
+      const ucp_type_table *u = PRIV(utt) + i;
+      if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
+            PRIV(utt_names) + u->name_offset) == 0)
+        {
+        c = u->value;
+        if (name[6] == 'x')
+          {
+          scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
+          }
+        else
+          {
+          if (script < 0) script = c; else
+            {
+            printf("** Only 1 script value allowed\n");
+            return;
+            }
+          }
+        break;
+        }
+      }
+
+    if (i >= PRIV(utt_size))
+      {
+      printf("** Unrecognized script name \"%s\"\n", value);
+      return;
+      }
+    }
+
+  else if (strcmp(CS name, "bool") == 0)
+    {
+    int not = 1;
+    if (value[0] == '!')
+      {
+      not = -1;
+      offset = 1;
+      }
+
+    for (i = 0; i < PRIV(utt_size); i++)
+      {
+      const ucp_type_table *u = PRIV(utt) + i;
+      if (u->type == PT_BOOL && strcmp(CS(value + offset),
+            PRIV(utt_names) + u->name_offset) == 0)
+        {
+        bprop_list[bprop_count++] = u->value * not;
+        break;
+        }
+      }
+
+    if (i >= PRIV(utt_size))
+      {
+      printf("** Unrecognized property name \"%s\"\n", value);
+      return;
+      }
+    }
+
+  else if (strcmp(CS name, "type") == 0)
+    {
+    if (type >= 0)
+      {
+      printf("** Only 1 type value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        type_not = TRUE;
+        offset = 1;
+        }
+
+      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
+        {
+        if (strcmp(CS (value + offset), CS type_names[i]) == 0)
+          {
+          type = i/2;
+          break;
+          }
+        }
+      if (i >= sizeof(type_names)/sizeof(char *))
+        {
+        printf("** Unrecognized type name \"%s\"\n", value);
+        return;
+        }
+      }
+    }
+
+  else if (strcmp(CS name, "gbreak") == 0)
+    {
+    if (gbreak >= 0)
+      {
+      printf("** Only 1 grapheme break value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        gbreak_not = TRUE;
+        offset = 1;
+        }
+
+      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
+        {
+        if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
+          {
+          gbreak = i/2;
+          break;
+          }
+        }
+      if (i >= sizeof(gb_names)/sizeof(char *))
+        {
+        printf("** Unrecognized gbreak name \"%s\"\n", value);
+        return;
+        }
+      }
+    }
+
+  else if (strcmp(CS name, "bidi") == 0 ||
+           strcmp(CS name, "bidiclass") == 0 ||
+           strcmp(CS name, "bidi_class") == 0 )
+    {
+    if (bidiclass >= 0)
+      {
+      printf("** Only 1 bidi class value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        bidiclass_not = TRUE;
+        offset = 1;
+        }
+      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
+        {
+        if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
+          {
+          bidiclass = i/2;
+          break;
+          }
+        }
+      if (i >= sizeof(bd_names)/sizeof(char *))
+        {
+        printf("** Unrecognized bidi class name \"%s\"\n", value);
+        return;
+        }
+      }
+    }
+
+  else
+    {
+    printf("** Unrecognized property name \"%s\"\n", name);
+    return;
+    }
+  }
+
+if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
+    gbreak < 0 && bidiclass < 0)
+  {
+  printf("** No properties specified\n");
+  return;
+  }
+
+for (c = 0; c <= 0x10ffff; c++)
+  {
+  if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
+
+  if (scriptx_count > 0)
+    {
+    const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
+    unsigned int found = 0;
+
+    for (i = 0; i < scriptx_count; i++)
+      {
+      int x = scriptx_list[i]/32;
+      int y = scriptx_list[i]%32;
+
+      /* Positive requirment */
+      if (scriptx_list[i] >= 0)
+        {
+        if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
+        }
+      /* Negative requirement */
+      else
+        {
+        if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
+        }
+      }
+
+    if (found != scriptx_count) continue;
+    }
+
+  if (bprop_count > 0)
+    {
+    const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) + 
+      UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
+    unsigned int found = 0;
+
+    for (i = 0; i < bprop_count; i++)
+      {
+      int x = bprop_list[i]/32;
+      int y = bprop_list[i]%32;
+
+      /* Positive requirement */
+      if (bprop_list[i] >= 0)
+        {
+        if ((bits_bprop[x] & (1u<<y)) != 0) found++;
+        }
+      /* Negative requirement */
+      else
+        {
+        if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
+        }
+      }
+
+    if (found != bprop_count) continue;
+    }
+
+  if (type >= 0)
+    {
+    if (type_not)
+      {
+      if (type == UCD_CHARTYPE(c)) continue;
+      }
+    else
+      {
+      if (type != UCD_CHARTYPE(c)) continue;
+      }
+    }
+
+  if (gbreak >= 0)
+    {
+    if (gbreak_not)
+      {
+      if (gbreak == UCD_GRAPHBREAK(c)) continue;
+      }
+    else
+      {
+      if (gbreak != UCD_GRAPHBREAK(c)) continue;
+      }
+    }
+
+  if (bidiclass >= 0)
+    {
+    if (bidiclass_not)
+      {
+      if (bidiclass == UCD_BIDICLASS(c)) continue;
+      }
+    else
+      {
+      if (bidiclass != UCD_BIDICLASS(c)) continue;
+      }
+    }
+
+  /* All conditions are met. Look for runs. */
+
+  ucd = GET_UCD(c);
+
+  for (i = c + 1; i < 0x10ffff; i++)
+    {
+    next_ucd = GET_UCD(i);
+    if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
+    }
+
+  if (--i > c)
+    {
+    printf("U+%04X..", c);
+    c = i;
+    hadrange = TRUE;
+    }
+  else if (hadrange) printf("%s", pad);
+
+  print_prop(c, FALSE);
+  if (c >= 0x100000) pad = "        ";
+    else if (c >= 0x10000) pad = "       ";
+  count++;
+  if (count >= 100)
+    {
+    printf("...\n");
+    break;
+    }
+  }
+
+if (count == 0) printf("No characters found\n");
+}
+
+
+/*************************************************
+*        Process command line                    *
+*************************************************/
+
+/* Copy the next word (a sequence of non-white-space characters) into a
+buffer, and then skip any white space that follows it. A word that is too long
+for the buffer is truncated instead of overrunning it; no valid command or
+property name is long enough for that to matter.
+
+Arguments:
+  s          points to the start of the word
+  dest       where to put the word, as a zero-terminated string
+  size       the size of dest (at least 1)
+
+Returns:     pointer to the start of the next word or the end of the string
+*/
+
+static unsigned char *
+command_next_word(unsigned char *s, unsigned char *dest, size_t size)
+{
+size_t n = 0;
+for (; *s != 0 && !isspace(*s); s++)
+  if (n < size - 1) dest[n++] = *s;
+dest[n] = 0;
+while (isspace(*s)) s++;
+return s;
+}
+
+
+/* Obey one command line. The first word is the command, which must be
+"findprop", "find", or "list"; anything else is reported as an unknown test
+command. Errors are reported on stdout. An empty line is ignored.
+
+For "findprop", each following item is either '+' followed by one UTF-8
+character, or a hexadecimal number that may be preceded by "U+". An item that
+is malformed (including '+' with nothing after it, an incomplete UTF-8
+sequence, or "U+" without any digits) is reported and skipped; the remaining
+items are still processed.
+
+Argument:   buffer   the zero-terminated command line
+Returns:             nothing
+*/
+
+static void
+process_command_line(unsigned char *buffer)
+{
+unsigned char *s, *t;
+unsigned char name[24];
+
+s = buffer;
+while (isspace(*s)) s++;
+if (*s == 0) return;
+
+s = command_next_word(s, name, sizeof(name));
+
+if (strcmp(CS name, "findprop") == 0)
+  {
+  while (*s != 0)
+    {
+    unsigned long c = 0;
+    unsigned char *endptr;
+    BOOL valid = TRUE;
+    t = s;
+
+    if (*t == '+')
+      {
+      unsigned int lead = *(++t);
+
+      if (lead == 0)                 /* Nothing follows the '+' */
+        {
+        valid = FALSE;
+        endptr = t;
+        }
+      else if (lead <= 0x7fu)        /* An ASCII character */
+        {
+        c = lead;
+        endptr = t + 1;
+        }
+      else if (lead < 0xc0u)         /* Not the start of a UTF-8 character */
+        {
+        valid = FALSE;
+        endptr = t + 1;
+        }
+      else
+        {
+        /* Make sure that all the following bytes are present before
+        decoding, because GETUTF8INC does no checking of its own. A zero
+        byte fails the test, so the end of the string is never passed. */
+
+        unsigned int extra = (lead < 0xe0u)? 1 : (lead < 0xf0u)? 2 :
+          (lead < 0xf8u)? 3 : (lead < 0xfcu)? 4 : 5;
+        unsigned int k;
+
+        endptr = t + 1;
+        for (k = 0; k < extra && (*endptr & 0xc0u) == 0x80u; k++) endptr++;
+
+        if (k < extra) valid = FALSE; else
+          {
+          unsigned int u = lead;
+          t++;
+          GETUTF8INC(u, t);
+          c = u;
+          }
+        }
+      }
+    else
+      {
+      if (strncmp(CS t, "U+", 2) == 0) t += 2;
+      if (isxdigit(*t))
+        {
+        c = strtoul(CS t, CSS(&endptr), 16);
+        }
+      else                           /* There must be at least one digit */
+        {
+        valid = FALSE;
+        endptr = t;
+        }
+      }
+
+    if (!valid || (*endptr != 0 && !isspace(*endptr)))
+      {
+      while (*endptr != 0 && !isspace(*endptr)) endptr++;
+      printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
+      }
+    else
+      {
+      /* The value is tested before it is narrowed to an unsigned int, so
+      that a very large number cannot wrap round to a valid code point. */
+
+      if (c > 0x10ffff)
+        printf("** U+%lx is too big for a Unicode code point\n", c);
+      else
+        print_prop((unsigned int)c, TRUE);
+      }
+    s = endptr;
+    while (isspace(*s)) s++;
+    }
+  }
+
+else if (strcmp(CS name, "find") == 0)
+  {
+  find_chars(s);
+  }
+
+else if (strcmp(CS name, "list") == 0)
+  {
+  while (*s != 0)
+    {
+    size_t i;
+    s = command_next_word(s, name, sizeof(name));
+
+    if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
+      {
+      for (i = 0; i < PRIV(utt_size); i++)
+        if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
+          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
+      }
+
+    else if (strcmp(CS name, "bool") == 0)
+      {
+      for (i = 0; i < PRIV(utt_size); i++)
+        if (PRIV(utt)[i].type == PT_BOOL)
+          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
+      }
+
+    else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
+      {
+      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
+        printf("%s %s\n", type_names[i], type_names[i+1]);
+      }
+
+    else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
+      {
+      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
+        {
+        if (gb_names[i+1][0] != 0)
+          printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
+        else
+          printf("%s\n", gb_names[i]);
+        }
+      }
+
+    else if (strcmp(CS name, "bidi") == 0 ||
+             strcmp(CS name, "bidiclasses") == 0)
+      {
+      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
+        printf("%3s %s\n", bd_names[i], bd_names[i+1]);
+      }
+
+    else
+      {
+      printf("** Unknown property \"%s\"\n", name);
+      break;
+      }
+    }
+  }
+
+else printf("** Unknown test command \"%s\"\n", name);
+}
+
+
+
+/*************************************************
+*               Main program                     *
+*************************************************/
+
+/* The only option is "-s", which must come first; it sets show_character. If
+there are any other arguments, they are joined, each followed by a space, to
+make a single command line, with "findprop " inserted at the start when the
+first of them looks like a character specifier (it starts with '+', "U+", or
+a digit, or it consists entirely of hexadecimal digits). That line is obeyed
+and the program exits. Otherwise, command lines are read from stdin until end
+of file, with a prompt (and history, if readline or libedit support is
+compiled in) when stdin is a terminal, and with each line echoed when it is
+not.
+
+Input that does not fit into the command buffer is reported and not obeyed
+instead of being written beyond the end of the buffer.
+
+Arguments:   the usual argc and argv
+Returns:     always 0
+*/
+
+int
+main(int argc, char **argv)
+{
+BOOL interactive;
+int first_arg = 1;
+unsigned char buffer[1024];
+
+if (argc > 1 && strcmp(argv[1], "-s") == 0)
+  {
+  show_character = TRUE;
+  first_arg++;
+  }
+
+if (argc > first_arg)
+  {
+  int i;
+  BOOL datafirst = TRUE;
+  const char *arg = argv[first_arg];
+  size_t used = 0;
+
+  /* The casts are needed because the ctype functions are not defined for
+  negative values, which a plain char can have. */
+
+  if (*arg != '+' && strncmp(arg, "U+", 2) != 0 &&
+      !isdigit((unsigned char)*arg))
+    {
+    for (; *arg != 0; arg++)
+      {
+      if (!isxdigit((unsigned char)*arg)) { datafirst = FALSE; break; }
+      }
+    }
+
+  buffer[0] = 0;
+  if (datafirst)
+    {
+    strcpy(CS buffer, "findprop ");
+    used = 9;
+    }
+
+  for (i = first_arg; i < argc; i++)
+    {
+    size_t room = sizeof(buffer) - used;
+    int n = snprintf(CS(buffer + used), room, "%s ", argv[i]);
+    if (n < 0 || (size_t)n >= room)
+      {
+      printf("** Command line arguments are too long\n");
+      return 0;
+      }
+    used += (size_t)n;
+    }
+
+  process_command_line(buffer);
+  return 0;
+  }
+
+interactive = is_stdin_tty();
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) using_history();
+#endif
+
+for(;;)
+  {
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+  if (interactive)
+    {
+    size_t len;
+    char *line = readline("> ");
+    if (line == NULL) break;
+    len = strlen(line);
+    if (len > 0) add_history(line);
+
+    /* Room is needed for an added newline and the terminating zero. */
+
+    if (len > sizeof(buffer) - 2)
+      {
+      printf("** Input line is too long: ignored\n");
+      free(line);
+      continue;
+      }
+
+    memcpy(buffer, line, len);
+    buffer[len] = '\n';
+    buffer[len+1] = 0;
+    free(line);
+    }
+  else
+#endif
+
+    {
+    if (interactive) printf("> ");
+    if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
+    if (!interactive) printf("%s", CS buffer);
+    }
+
+  process_command_line(buffer);
+  }
+
+if (interactive) printf("\n");
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) clear_history();
+#endif
+
+return 0;
+}
+
+/* End */