aboutsummaryrefslogtreecommitdiff
path: root/maint/ucptest.c
diff options
context:
space:
mode:
Diffstat (limited to 'maint/ucptest.c')
-rw-r--r--maint/ucptest.c1086
1 files changed, 1086 insertions, 0 deletions
diff --git a/maint/ucptest.c b/maint/ucptest.c
new file mode 100644
index 00000000..6faead30
--- /dev/null
+++ b/maint/ucptest.c
@@ -0,0 +1,1086 @@
+/***************************************************
+* A program for testing the Unicode property table *
+***************************************************/
+
+/* Copyright (c) University of Cambridge 2008-2022 */
+
+/* Compile thus:
+
+ gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
+ ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
+
+ Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
+ support in pcre2test.
+*/
+
+/* This is a hacked-up program for testing the Unicode properties tables of
+PCRE2. It can also be used for finding characters with certain properties. I
+wrote it to help with debugging, and have added things that I found useful, in
+a rather haphazard way. The code has never been seriously tidied or checked for
+robustness, but it shouldn't now give compiler warnings.
+
+There is only one option: "-s". If given, it applies only to the "findprop"
+command. It causes the UTF-8 sequence of bytes that encode the character to be
+output between angle brackets at the end of the line. On a UTF-8 terminal, this
+will show the appropriate graphic for the code point.
+
+If the command has arguments, they are concatenated into a buffer, separated by
+spaces. If the first argument starts "U+" or consists entirely of hexadecimal
+digits, "findprop" is inserted at the start. The buffer is then processed as a
+single line file, after which the program exits. If there are no arguments, the
+program reads commands line by line on stdin and writes output to stdout. The
+return code is always zero.
+
+There are three commands:
+
+The command "findprop" must be followed by a space-separated list of Unicode
+code points as hex numbers, either without any prefix or starting with "U+", or
+as individual UTF-8 characters preceded by '+'. For example:
+
+ findprop U+1234 5Abc +?
+
+The output is one long line per character, listing Unicode properties that have
+values, followed by its other case or cases if one or more exist, followed by
+its Script Extension list if there is one. This list is in square brackets. A
+second list in square brackets gives all the Boolean properties of the
+character. The properties that come first are:
+
+ Bidi class e.g. NSM (most common is L)
+ General type e.g. Letter
+ Specific type e.g. Upper case letter
+ Script e.g. Medefaidrin
+ Grapheme break type e.g. Extend (most common is Other)
+
+Script names and Boolean property names are all in lower case, with underscores
+and hyphens removed, because that's how they are stored for "loose" matching.
+
+The command "find" must be followed by a list of property types and their
+values. The values are case-sensitive, except for bidi class. This finds
+characters that have those properties. If multiple properties are listed, they
+must all be matched. Currently supported:
+
+ script <name> The character must have this script property. Only one
+ such script may be given.
+ scriptx <name> This script must be in the character's Script Extension
+ property list. If this is used many times, all the given
+ scripts must be present.
+ type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
+ gbreak <name> The grapheme break property must match.
+ bidi <class> The character's bidi class must match.
+ bool <name> The character's Boolean property list must contain this
+ property.
+
+If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
+Script Extensions and Boolean properties, there may be a mixture of positive
+and negative requirements. All must be satisfied.
+
+Sequences of two or more characters are shown as ranges, for example
+U+0041..U+004A. No more than 100 lines are output. If there are more
+characters, the list ends with ...
+
+The command "list" must be followed by one of property names script, bool,
+type, gbreak or bidi. The defined values for that property are listed. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "../src/config.h"
+#endif
+
+#ifndef SUPPORT_UNICODE
+#define SUPPORT_UNICODE
+#endif
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../src/pcre2_internal.h"
+#include "../src/pcre2_ucp.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+#if defined(SUPPORT_LIBREADLINE)
+#include <readline/readline.h>
+#include <readline/history.h>
+#else
+#if defined(HAVE_EDITLINE_READLINE_H)
+#include <editline/readline.h>
+#else
+#include <readline/readline.h>
+#ifdef RL_VERSION_MAJOR
+#include <readline/history.h>
+#endif
+#endif
+#endif
+#endif
+
+
+/* -------------------------------------------------------------------*/
+
+/* Shorthand pointer casts between plain char and unsigned char strings. Each
+macro expands to a bare cast and is written directly in front of the expression
+it applies to, for example "CS name" or "CS(value + offset)". */
+
+#define CS (char *)
+#define CCS (const char *)
+#define CSS (char **)
+#define US (unsigned char *)
+#define CUS (const unsigned char *)
+#define USS (unsigned char **)
+
+/* -------------------------------------------------------------------*/
+
+/* Set to TRUE by the "-s" command line option. When it is set, print_prop()
+appends the UTF-8 encoding of the character to the output line, but only when
+it is called for a single character (the "findprop" command). */
+
+static BOOL show_character = FALSE;
+
+/* Pairs of (abbreviation, description) for the specific character types. The
+"find type" command searches the abbreviations and uses the pair number (array
+index divided by 2) as the value it compares with UCD_CHARTYPE(), so the order
+of the pairs is significant. The "list type" command prints both strings. */
+
+static const unsigned char *type_names[] = {
+ US"Cc", US"Control",
+ US"Cf", US"Format",
+ US"Cn", US"Unassigned",
+ US"Co", US"Private use",
+ US"Cs", US"Surrogate",
+ US"Ll", US"Lower case letter",
+ US"Lm", US"Modifier letter",
+ US"Lo", US"Other letter",
+ US"Lt", US"Title case letter",
+ US"Lu", US"Upper case letter",
+ US"Mc", US"Spacing mark",
+ US"Me", US"Enclosing mark",
+ US"Mn", US"Non-spacing mark",
+ US"Nd", US"Decimal number",
+ US"Nl", US"Letter number",
+ US"No", US"Other number",
+ US"Pc", US"Connector punctuation",
+ US"Pd", US"Dash punctuation",
+ US"Pe", US"Close punctuation",
+ US"Pf", US"Final punctuation",
+ US"Pi", US"Initial punctuation",
+ US"Po", US"Other punctuation",
+ US"Ps", US"Open punctuation",
+ US"Sc", US"Currency symbol",
+ US"Sk", US"Modifier symbol",
+ US"Sm", US"Mathematical symbol",
+ US"So", US"Other symbol",
+ US"Zl", US"Line separator",
+ US"Zp", US"Paragraph separator",
+ US"Zs", US"Space separator"
+};
+
+/* Pairs of (name, description) for the grapheme break property. The "find
+gbreak" command searches the names and uses the pair number (array index
+divided by 2) as the value it compares with UCD_GRAPHBREAK(), so the order of
+the pairs is significant. An empty description means that "list gbreak" prints
+the name on its own. */
+
+static const unsigned char *gb_names[] = {
+ US"CR", US"carriage return",
+ US"LF", US"linefeed",
+ US"Control", US"",
+ US"Extend", US"",
+ US"Prepend", US"",
+ US"SpacingMark", US"",
+ US"L", US"Hangul syllable type L",
+ US"V", US"Hangul syllable type V",
+ US"T", US"Hangul syllable type T",
+ US"LV", US"Hangul syllable type LV",
+ US"LVT", US"Hangul syllable type LVT",
+ US"Regional_Indicator", US"",
+ US"Other", US"",
+ US"ZWJ", US"zero width joiner",
+ US"Extended_Pictographic", US""
+};
+
+/* Pairs of (abbreviation, description) for the bidi classes. The "find bidi"
+command searches the abbreviations (caselessly) and uses the pair number (array
+index divided by 2) as the value it compares with UCD_BIDICLASS(), so the order
+of the pairs is significant. The "list bidi" command prints both strings. */
+
+static const unsigned char *bd_names[] = {
+  US"AL",   US"Arabic letter",
+  US"AN",   US"Arabic number",
+  US"B",    US"Paragraph separator",
+  US"BN",   US"Boundary neutral",
+  US"CS",   US"Common separator",
+  US"EN",   US"European number",
+  US"ES",   US"European separator",
+  US"ET",   US"European terminator",
+  US"FSI",  US"First strong isolate",
+  US"L",    US"Left-to-right",
+  US"LRE",  US"Left-to-right embedding",
+  US"LRI",  US"Left-to-right isolate",
+  US"LRO",  US"Left-to-right override",
+  US"NSM",  US"Non-spacing mark",
+  US"ON",   US"Other neutral",
+  US"PDF",  US"Pop directional format",
+  US"PDI",  US"Pop directional isolate",
+  US"R",    US"Right-to-left",
+  US"RLE",  US"Right-to-left embedding",
+  US"RLI",  US"Right-to-left isolate",
+  US"RLO",  US"Right-to-left override",
+  US"S",    US"Segment separator",
+  US"WS",   US"White space"
+};
+
+/* Tables used by ord2utf8(). Entry i of utf8_table1 is the largest value that
+is encoded in i+1 bytes; entry i of utf8_table2 holds the marker bits that are
+ORed into the first byte of an encoding that is i+1 bytes long. */
+
+static const unsigned int utf8_table1[] = {
+ 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
+
+static const int utf8_table2[] = {
+ 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
+the pointer. On entry, c must already hold the first byte of the character and
+eptr must point at the byte that follows it. The bits 0x20, 0x10, 0x08 and 0x04
+of the first byte are tested in turn to decide whether 1, 2, 3, 4 or 5 further
+bytes are read. On exit, c holds the decoded value and eptr points past the
+last byte that was used. No check is made that the following bytes exist or are
+valid continuation bytes. The arguments are not parenthesized and are evaluated
+more than once, so they should be simple variable names.
+
+NOTE(review): nothing else in this file refers to this macro by name; it is
+presumably reached through GETCHARINC, which is defined in an included header.
+Confirm before removing or changing it. */
+
+#define GETUTF8INC(c, eptr) \
+ { \
+ if ((c & 0x20u) == 0) \
+ c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
+ else if ((c & 0x10u) == 0) \
+ { \
+ c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
+ eptr += 2; \
+ } \
+ else if ((c & 0x08u) == 0) \
+ { \
+ c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
+ ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
+ eptr += 3; \
+ } \
+ else if ((c & 0x04u) == 0) \
+ { \
+ c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
+ ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
+ (eptr[3] & 0x3fu); \
+ eptr += 4; \
+ } \
+ else \
+ { \
+ c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
+ ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
+ ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
+ eptr += 5; \
+ } \
+ }
+
+
+
+/*************************************************
+*       Convert character value to UTF-8         *
+*************************************************/
+
+/* Encode an unsigned int value in the range 0 - 0x7fffffff as a UTF-8
+character of 1 to 6 bytes. The length is found by searching utf8_table1 for the
+first limit that is not less than the value. The trailing bytes are then filled
+in from the last one backwards, six bits at a time, and what is left is
+combined with the marker bits from utf8_table2 to make the first byte.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 6 bytes long
+
+Returns:     number of bytes placed in the buffer
+             0 if input code point is too big (nothing is written)
+*/
+
+static size_t
+ord2utf8(unsigned int cvalue, unsigned char *buffer)
+{
+const size_t table_len = sizeof(utf8_table1)/sizeof(utf8_table1[0]);
+size_t extra = 0;     /* Number of bytes that follow the first one */
+size_t pos;
+
+while (extra < table_len && cvalue > utf8_table1[extra]) extra++;
+if (extra == table_len) return 0;
+
+for (pos = extra; pos > 0; pos--)
+  {
+  buffer[pos] = 0x80 | (cvalue & 0x3f);
+  cvalue >>= 6;
+  }
+
+buffer[0] = utf8_table2[extra] | cvalue;
+return extra + 1;
+}
+
+
+
+/*************************************************
+*            Test for interaction                *
+*************************************************/
+
+/* Ask the C library whether the file descriptor behind stdin refers to a
+terminal. The WIN32 build uses the underscore-prefixed names of the same two
+functions.
+
+Arguments:   none
+Returns:     the value given by isatty() (non-zero for a terminal)
+*/
+
+static BOOL
+is_stdin_tty(void)
+{
+BOOL on_terminal;
+
+#if defined WIN32
+on_terminal = _isatty(_fileno(stdin));
+#else
+on_terminal = isatty(fileno(stdin));
+#endif
+
+return on_terminal;
+}
+
+
+/*************************************************
+*          Get name from ucp ident               *
+*************************************************/
+
+/* The utt table holds both full names and abbreviations, so up to two entries
+can match a given property value. The first two matches (in table order) are
+collected, and the longer name is returned; when the names have the same
+length the earlier one is kept. There is one exception: when a script is being
+looked up and a name that is exactly 3 characters long is selected, the search
+stops there, because some scripts have 3-character full names. For scripts,
+entries of type PT_SCX are accepted as well as those of type PT_SC. A linear
+search is good enough for a test program.
+
+Arguments:
+  prop       the property value to look up
+  type       the property type (a PT_xxx value)
+
+Returns:     pointer to the name within the utt names table
+             "??" if no table entry matches
+*/
+
+static const char *
+get_propname(int prop, int type)
+{
+const int alt_type = (type == PT_SC)? PT_SCX : type;
+const char *best = NULL;
+size_t best_len = 0;
+size_t hits[2];
+size_t nhits = 0;
+size_t k;
+
+/* Collect the indexes of at most two matching table entries. */
+
+for (k = 0; k < PRIV(utt_size) && nhits < 2; k++)
+  {
+  const ucp_type_table *entry = PRIV(utt) + k;
+  if (entry->type != type && entry->type != alt_type) continue;
+  if (entry->value != prop) continue;
+  hits[nhits++] = k;
+  }
+
+if (nhits == 0) return "??";
+
+/* Choose between the candidates. */
+
+for (k = 0; k < nhits; k++)
+  {
+  const char *candidate = PRIV(utt_names) + PRIV(utt)[hits[k]].name_offset;
+  size_t candidate_len = strlen(candidate);
+
+  if (candidate_len <= best_len) continue;
+  best = candidate;
+  if (candidate_len == 3 && type == PT_SC) break;
+  best_len = candidate_len;
+  }
+
+return best;
+}
+
+
+/*************************************************
+* Print Unicode property info for a char *
+*************************************************/
+
+/* Write one line to stdout describing a code point: the code point itself,
+bidi class, general type, specific type, script name and grapheme break type,
+optionally followed by other-case characters, the Script Extensions list in
+square brackets, the Boolean property list in square brackets, and the UTF-8
+form of the character.
+
+Arguments:
+  c             the code point
+  is_just_one   TRUE when a single character is being described; the other
+                case(s) and the UTF-8 form are output only in this case
+
+Returns:        nothing
+*/
+
+static void
+print_prop(unsigned int c, BOOL is_just_one)
+{
+int type = UCD_CATEGORY(c);
+int fulltype = UCD_CHARTYPE(c);
+int script = UCD_SCRIPT(c);
+int scriptx = UCD_SCRIPTX(c);
+int gbprop = UCD_GRAPHBREAK(c);
+int bidi = UCD_BIDICLASS(c);
+unsigned int othercase = UCD_OTHERCASE(c);
+int caseset = UCD_CASESET(c);
+int bprops = UCD_BPROPS(c);
+
+/* These defaults remain in place if a value is not handled by the switches
+below (the last two switches have their own default cases). */
+
+const unsigned char *fulltypename = US"??";
+const unsigned char *typename = US"??";
+const unsigned char *graphbreak = US"??";
+const unsigned char *bidiclass = US"??";
+const unsigned char *scriptname = CUS get_propname(script, PT_SC);
+
+/* General category */
+
+switch (type)
+ {
+ case ucp_C: typename = US"Control"; break;
+ case ucp_L: typename = US"Letter"; break;
+ case ucp_M: typename = US"Mark"; break;
+ case ucp_N: typename = US"Number"; break;
+ case ucp_P: typename = US"Punctuation"; break;
+ case ucp_S: typename = US"Symbol"; break;
+ case ucp_Z: typename = US"Separator"; break;
+ }
+
+/* Specific type */
+
+switch (fulltype)
+ {
+ case ucp_Cc: fulltypename = US"Control"; break;
+ case ucp_Cf: fulltypename = US"Format"; break;
+ case ucp_Cn: fulltypename = US"Unassigned"; break;
+ case ucp_Co: fulltypename = US"Private use"; break;
+ case ucp_Cs: fulltypename = US"Surrogate"; break;
+ case ucp_Ll: fulltypename = US"Lower case letter"; break;
+ case ucp_Lm: fulltypename = US"Modifier letter"; break;
+ case ucp_Lo: fulltypename = US"Other letter"; break;
+ case ucp_Lt: fulltypename = US"Title case letter"; break;
+ case ucp_Lu: fulltypename = US"Upper case letter"; break;
+ case ucp_Mc: fulltypename = US"Spacing mark"; break;
+ case ucp_Me: fulltypename = US"Enclosing mark"; break;
+ case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
+ case ucp_Nd: fulltypename = US"Decimal number"; break;
+ case ucp_Nl: fulltypename = US"Letter number"; break;
+ case ucp_No: fulltypename = US"Other number"; break;
+ case ucp_Pc: fulltypename = US"Connector punctuation"; break;
+ case ucp_Pd: fulltypename = US"Dash punctuation"; break;
+ case ucp_Pe: fulltypename = US"Close punctuation"; break;
+ case ucp_Pf: fulltypename = US"Final punctuation"; break;
+ case ucp_Pi: fulltypename = US"Initial punctuation"; break;
+ case ucp_Po: fulltypename = US"Other punctuation"; break;
+ case ucp_Ps: fulltypename = US"Open punctuation"; break;
+ case ucp_Sc: fulltypename = US"Currency symbol"; break;
+ case ucp_Sk: fulltypename = US"Modifier symbol"; break;
+ case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
+ case ucp_So: fulltypename = US"Other symbol"; break;
+ case ucp_Zl: fulltypename = US"Line separator"; break;
+ case ucp_Zp: fulltypename = US"Paragraph separator"; break;
+ case ucp_Zs: fulltypename = US"Space separator"; break;
+ }
+
+/* Grapheme break property */
+
+switch(gbprop)
+ {
+ case ucp_gbCR: graphbreak = US"CR"; break;
+ case ucp_gbLF: graphbreak = US"LF"; break;
+ case ucp_gbControl: graphbreak = US"Control"; break;
+ case ucp_gbExtend: graphbreak = US"Extend"; break;
+ case ucp_gbPrepend: graphbreak = US"Prepend"; break;
+ case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
+ case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
+ case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
+ case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
+ case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
+ case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
+ case ucp_gbRegional_Indicator:
+ graphbreak = US"Regional Indicator"; break;
+ case ucp_gbOther: graphbreak = US"Other"; break;
+ case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
+ case ucp_gbExtended_Pictographic:
+ graphbreak = US"Extended Pictographic"; break;
+ default: graphbreak = US"Unknown"; break;
+ }
+
+/* Bidi class. NOTE(review): the trailing spaces in the shorter strings look
+like column padding; confirm the intended widths against the upstream file. */
+
+switch(bidi)
+ {
+ case ucp_bidiAL: bidiclass = US"AL "; break;
+ case ucp_bidiFSI: bidiclass = US"FSI"; break;
+ case ucp_bidiL: bidiclass = US"L "; break;
+ case ucp_bidiLRE: bidiclass = US"LRE"; break;
+ case ucp_bidiLRI: bidiclass = US"LRI"; break;
+ case ucp_bidiLRO: bidiclass = US"LRO"; break;
+ case ucp_bidiPDF: bidiclass = US"PDF"; break;
+ case ucp_bidiPDI: bidiclass = US"PDI"; break;
+ case ucp_bidiR: bidiclass = US"R "; break;
+ case ucp_bidiRLE: bidiclass = US"RLE"; break;
+ case ucp_bidiRLI: bidiclass = US"RLI"; break;
+ case ucp_bidiRLO: bidiclass = US"RLO"; break;
+ case ucp_bidiAN: bidiclass = US"AN "; break;
+ case ucp_bidiB: bidiclass = US"B "; break;
+ case ucp_bidiBN: bidiclass = US"BN "; break;
+ case ucp_bidiCS: bidiclass = US"CS "; break;
+ case ucp_bidiEN: bidiclass = US"EN "; break;
+ case ucp_bidiES: bidiclass = US"ES "; break;
+ case ucp_bidiET: bidiclass = US"ET "; break;
+ case ucp_bidiNSM: bidiclass = US"NSM"; break;
+ case ucp_bidiON: bidiclass = US"ON "; break;
+ case ucp_bidiS: bidiclass = US"S "; break;
+ case ucp_bidiWS: bidiclass = US"WS "; break;
+ default: bidiclass = US"???"; break;
+ }
+
+printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
+ scriptname, graphbreak);
+
+/* Other case(s): first the single other case, then any further members of the
+character's caseless set. The pointer starts one entry before the set because
+it is incremented before each read; the set ends at the first value that is
+not less than NOTACHAR. */
+
+if (is_just_one && othercase != c)
+ {
+ printf(", U+%04X", othercase);
+ if (caseset != 0)
+ {
+ const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
+ while (*(++p) < NOTACHAR)
+ {
+ unsigned int d = *p;
+ if (d != othercase && d != c) printf(", U+%04X", d);
+ }
+ }
+ }
+
+/* Script Extensions: scriptx is an offset into the script sets table, where a
+bit map is tested for every script number below ucp_Unknown. */
+
+if (scriptx != 0)
+ {
+ const char *sep = "";
+ const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
+ printf(", [");
+ for (int i = 0; i < ucp_Unknown; i++)
+ if (MAPBIT(p, i) != 0)
+ {
+ printf("%s%s", sep, get_propname(i, PT_SC));
+ sep = ", ";
+ }
+ printf("]");
+ }
+
+/* Boolean properties: bprops selects a bit map of ucd_boolprop_sets_item_size
+words in the Boolean property sets table. */
+
+if (bprops != 0)
+ {
+ const char *sep = "";
+ const uint32_t *p = PRIV(ucd_boolprop_sets) +
+ bprops * ucd_boolprop_sets_item_size;
+ printf(", [");
+ for (int i = 0; i < ucp_Bprop_Count; i++)
+ if (MAPBIT(p, i) != 0)
+ {
+ printf("%s%s", sep, get_propname(i, PT_BOOL));
+ sep = ", ";
+ }
+ printf("]");
+ }
+
+/* The "-s" option: show the character itself as UTF-8. */
+
+if (show_character && is_just_one)
+ {
+ unsigned char buffer[8];
+ size_t len = ord2utf8(c, buffer);
+ printf(", >%.*s<", (int)len, buffer);
+ }
+
+printf("\n");
+}
+
+
+
+/*************************************************
+*      Find character(s) with given property/ies *
+*************************************************/
+
+/* Implements the "find" command. The argument is a list of (property type,
+value) pairs separated by white space. Underscores and hyphens are removed from
+each value, and a value that starts with '!' is a negative requirement. After
+the list has been parsed, every code point from 0 to 0x10ffff is tested, and
+those that satisfy all the requirements are output by print_prop(). A run of
+consecutive code points whose property records are identical is shown as a
+range. At most 100 lines are output.
+
+Negative requirements are remembered in separate flag vectors instead of by
+negating the property number, because a property whose number is zero cannot be
+negated. Words that are too long for the name and value buffers are truncated;
+a truncated word cannot match any valid name and so is reported as
+unrecognized.
+
+Argument:   points to the text that follows "find", zero-terminated
+Returns:    nothing; errors are reported on stdout
+*/
+
+static void
+find_chars(unsigned char *s)
+{
+unsigned char name[128];
+unsigned char value[128];
+unsigned char *t;
+unsigned int count= 0;
+int scriptx_list[128];
+BOOL scriptx_neg[128];
+unsigned int scriptx_count = 0;
+int bprop_list[128];
+BOOL bprop_neg[128];
+unsigned int bprop_count = 0;
+uint32_t i, c;
+int script = -1;
+int type = -1;
+int gbreak = -1;
+int bidiclass = -1;
+BOOL script_not = FALSE;
+BOOL type_not = FALSE;
+BOOL gbreak_not = FALSE;
+BOOL bidiclass_not = FALSE;
+BOOL hadrange = FALSE;
+const ucd_record *ucd, *next_ucd;
+
+/* The padding is output in front of a single character once a range has been
+shown, so that it lines up with the second number of a range. Its width is that
+of "U+XXXX.."; it is widened below when the code points need more digits. */
+
+const char *pad = "        ";
+
+while (*s != 0)
+  {
+  unsigned int offset = 0;
+  BOOL scriptx_not = FALSE;
+
+  /* Pick out the property type and its value, never writing beyond the ends
+  of the buffers. */
+
+  for (t = name; *s != 0 && !isspace(*s); s++)
+    {
+    if (t < name + sizeof(name) - 1) *t++ = *s;
+    }
+  *t = 0;
+  while (isspace(*s)) s++;
+
+  for (t = value; *s != 0 && !isspace(*s); s++)
+    {
+    if (*s != '_' && *s != '-' && t < value + sizeof(value) - 1) *t++ = *s;
+    }
+  *t = 0;
+  while (isspace(*s)) s++;
+
+  if (strcmp(CS name, "script") == 0 ||
+      strcmp(CS name, "scriptx") == 0)
+    {
+    BOOL is_scriptx = name[6] == 'x';
+
+    for (t = value; *t != 0; t++) *t = tolower(*t);
+
+    if (value[0] == '!')
+      {
+      if (is_scriptx) scriptx_not = TRUE;
+        else script_not = TRUE;
+      offset = 1;
+      }
+
+    for (i = 0; i < PRIV(utt_size); i++)
+      {
+      const ucp_type_table *u = PRIV(utt) + i;
+      if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
+            PRIV(utt_names) + u->name_offset) == 0)
+        {
+        c = u->value;
+        if (is_scriptx)
+          {
+          if (scriptx_count >= sizeof(scriptx_list)/sizeof(scriptx_list[0]))
+            {
+            printf("** Too many scriptx values\n");
+            return;
+            }
+          scriptx_list[scriptx_count] = (int)c;
+          scriptx_neg[scriptx_count++] = scriptx_not;
+          }
+        else
+          {
+          if (script < 0) script = c; else
+            {
+            printf("** Only 1 script value allowed\n");
+            return;
+            }
+          }
+        break;
+        }
+      }
+
+    if (i >= PRIV(utt_size))
+      {
+      printf("** Unrecognized script name \"%s\"\n", value);
+      return;
+      }
+    }
+
+  else if (strcmp(CS name, "bool") == 0)
+    {
+    BOOL bprop_not = FALSE;
+    if (value[0] == '!')
+      {
+      bprop_not = TRUE;
+      offset = 1;
+      }
+
+    for (i = 0; i < PRIV(utt_size); i++)
+      {
+      const ucp_type_table *u = PRIV(utt) + i;
+      if (u->type == PT_BOOL && strcmp(CS(value + offset),
+            PRIV(utt_names) + u->name_offset) == 0)
+        {
+        if (bprop_count >= sizeof(bprop_list)/sizeof(bprop_list[0]))
+          {
+          printf("** Too many bool values\n");
+          return;
+          }
+        bprop_list[bprop_count] = u->value;
+        bprop_neg[bprop_count++] = bprop_not;
+        break;
+        }
+      }
+
+    if (i >= PRIV(utt_size))
+      {
+      printf("** Unrecognized property name \"%s\"\n", value);
+      return;
+      }
+    }
+
+  else if (strcmp(CS name, "type") == 0)
+    {
+    if (type >= 0)
+      {
+      printf("** Only 1 type value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        type_not = TRUE;
+        offset = 1;
+        }
+
+      /* The names table holds (abbreviation, description) pairs; the pair
+      number is the type value. */
+
+      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
+        {
+        if (strcmp(CS (value + offset), CS type_names[i]) == 0)
+          {
+          type = i/2;
+          break;
+          }
+        }
+      if (i >= sizeof(type_names)/sizeof(char *))
+        {
+        printf("** Unrecognized type name \"%s\"\n", value);
+        return;
+        }
+      }
+    }
+
+  else if (strcmp(CS name, "gbreak") == 0)
+    {
+    if (gbreak >= 0)
+      {
+      printf("** Only 1 grapheme break value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        gbreak_not = TRUE;
+        offset = 1;
+        }
+
+      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
+        {
+        if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
+          {
+          gbreak = i/2;
+          break;
+          }
+        }
+      if (i >= sizeof(gb_names)/sizeof(char *))
+        {
+        printf("** Unrecognized gbreak name \"%s\"\n", value);
+        return;
+        }
+      }
+    }
+
+  else if (strcmp(CS name, "bidi") == 0 ||
+           strcmp(CS name, "bidiclass") == 0 ||
+           strcmp(CS name, "bidi_class") == 0 )
+    {
+    if (bidiclass >= 0)
+      {
+      printf("** Only 1 bidi class value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        bidiclass_not = TRUE;
+        offset = 1;
+        }
+      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
+        {
+        if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
+          {
+          bidiclass = i/2;
+          break;
+          }
+        }
+      if (i >= sizeof(bd_names)/sizeof(char *))
+        {
+        printf("** Unrecognized bidi class name \"%s\"\n", value);
+        return;
+        }
+      }
+    }
+
+  else
+    {
+    printf("** Unrecognized property name \"%s\"\n", name);
+    return;
+    }
+  }
+
+if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
+    gbreak < 0 && bidiclass < 0)
+  {
+  printf("** No properties specified\n");
+  return;
+  }
+
+for (c = 0; c <= 0x10ffff; c++)
+  {
+  if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
+
+  if (scriptx_count > 0)
+    {
+    const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
+    unsigned int found = 0;
+
+    for (i = 0; i < scriptx_count; i++)
+      {
+      int x = scriptx_list[i]/32;
+      int y = scriptx_list[i]%32;
+      BOOL present = (bits_scriptx[x] & (1u<<y)) != 0;
+
+      /* A positive requirement needs the bit to be set; a negative
+      requirement needs it to be unset. */
+
+      if (present != scriptx_neg[i]) found++;
+      }
+
+    if (found != scriptx_count) continue;
+    }
+
+  if (bprop_count > 0)
+    {
+    const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
+      UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
+    unsigned int found = 0;
+
+    for (i = 0; i < bprop_count; i++)
+      {
+      int x = bprop_list[i]/32;
+      int y = bprop_list[i]%32;
+      BOOL present = (bits_bprop[x] & (1u<<y)) != 0;
+
+      if (present != bprop_neg[i]) found++;
+      }
+
+    if (found != bprop_count) continue;
+    }
+
+  if (type >= 0)
+    {
+    if (type_not)
+      {
+      if (type == UCD_CHARTYPE(c)) continue;
+      }
+    else
+      {
+      if (type != UCD_CHARTYPE(c)) continue;
+      }
+    }
+
+  if (gbreak >= 0)
+    {
+    if (gbreak_not)
+      {
+      if (gbreak == UCD_GRAPHBREAK(c)) continue;
+      }
+    else
+      {
+      if (gbreak != UCD_GRAPHBREAK(c)) continue;
+      }
+    }
+
+  if (bidiclass >= 0)
+    {
+    if (bidiclass_not)
+      {
+      if (bidiclass == UCD_BIDICLASS(c)) continue;
+      }
+    else
+      {
+      if (bidiclass != UCD_BIDICLASS(c)) continue;
+      }
+    }
+
+  /* All conditions are met. Look for runs: scan forward while the following
+  code points have a record that is identical to this one. The scan includes
+  0x10ffff so that the last code point can be part of a range. */
+
+  ucd = GET_UCD(c);
+
+  for (i = c + 1; i <= 0x10ffff; i++)
+    {
+    next_ucd = GET_UCD(i);
+    if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
+    }
+
+  /* Step back to the last code point of the run. When the run is longer than
+  one character, output its start, and continue the main loop from its end. */
+
+  if (--i > c)
+    {
+    printf("U+%04X..", c);
+    c = i;
+    hadrange = TRUE;
+    }
+  else if (hadrange) printf("%s", pad);
+
+  print_prop(c, FALSE);
+  if (c >= 0x100000) pad = "          ";
+    else if (c >= 0x10000) pad = "         ";
+  count++;
+  if (count >= 100)
+    {
+    printf("...\n");
+    break;
+    }
+  }
+
+if (count == 0) printf("No characters found\n");
+}
+
+
+/*************************************************
+*      Process command line                      *
+*************************************************/
+
+/* Process one command. The first word selects the command:
+
+  findprop   each following item is either '+' followed by a single character,
+             or a hexadecimal code point with an optional "U+" prefix;
+             print_prop() is called for each valid item
+  find       the rest of the line is handed to find_chars()
+  list       each following word names a property whose values are listed
+
+Words that are too long for the name buffer are truncated; a truncated word
+cannot match any valid name and so is reported as unknown. All errors are
+reported on stdout.
+
+Argument:   the command line, zero-terminated
+Returns:    nothing
+*/
+
+static void
+process_command_line(unsigned char *buffer)
+{
+unsigned char *s, *t;
+unsigned char name[24];
+
+s = buffer;
+while (isspace(*s)) s++;
+if (*s == 0) return;
+
+for (t = name; *s != 0 && !isspace(*s); s++)
+  {
+  if (t < name + sizeof(name) - 1) *t++ = *s;
+  }
+*t = 0;
+while (isspace(*s)) s++;
+
+if (strcmp(CS name, "findprop") == 0)
+  {
+  while (*s != 0)
+    {
+    unsigned long c = 0;
+    BOOL valid = TRUE;
+    unsigned char *endptr;
+    t = s;
+
+    if (*t == '+')
+      {
+      unsigned int ch = *(++t);
+
+      /* A '+' at the very end of the line has no character after it. */
+
+      if (ch == 0)
+        {
+        valid = FALSE;
+        endptr = t;
+        }
+      else
+        {
+        /* NOTE(review): GETCHARINC is defined in an included header. If it
+        leaves t pointing after the character, the "+1" below skips an extra
+        byte when the character is not ASCII - confirm. */
+
+        if (ch > 0x7fu)
+          {
+          GETCHARINC(ch, t);
+          }
+        endptr = t+1;
+        }
+      c = ch;
+      }
+    else
+      {
+      if (strncmp(CS t, "U+", 2) == 0) t += 2;
+
+      /* Insist on at least one hex digit, so that an empty number is not
+      taken as zero and strtoul() cannot accept a sign. The value is kept as
+      an unsigned long until its range has been checked. */
+
+      if (isxdigit(*t))
+        {
+        c = strtoul(CS t, CSS(&endptr), 16);
+        }
+      else
+        {
+        valid = FALSE;
+        endptr = t;
+        }
+      }
+
+    if (!valid || (*endptr != 0 && !isspace(*endptr)))
+      {
+      while (*endptr != 0 && !isspace(*endptr)) endptr++;
+      printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
+      }
+    else
+      {
+      if (c > 0x10ffff)
+        printf("** U+%lx is too big for a Unicode code point\n", c);
+      else
+        print_prop((unsigned int)c, TRUE);
+      }
+    s = endptr;
+    while (isspace(*s)) s++;
+    }
+  }
+
+else if (strcmp(CS name, "find") == 0)
+  {
+  find_chars(s);
+  }
+
+else if (strcmp(CS name, "list") == 0)
+  {
+  while (*s != 0)
+    {
+    size_t i;
+    for (t = name; *s != 0 && !isspace(*s); s++)
+      {
+      if (t < name + sizeof(name) - 1) *t++ = *s;
+      }
+    *t = 0;
+    while (isspace(*s)) s++;
+
+    if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
+      {
+      for (i = 0; i < PRIV(utt_size); i++)
+        if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
+          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
+      }
+
+    else if (strcmp(CS name, "bool") == 0)
+      {
+      for (i = 0; i < PRIV(utt_size); i++)
+        if (PRIV(utt)[i].type == PT_BOOL)
+          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
+      }
+
+    else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
+      {
+      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
+        printf("%s %s\n", type_names[i], type_names[i+1]);
+      }
+
+    else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
+      {
+      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
+        {
+        if (gb_names[i+1][0] != 0)
+          printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
+        else
+          printf("%s\n", gb_names[i]);
+        }
+      }
+
+    else if (strcmp(CS name, "bidi") == 0 ||
+             strcmp(CS name, "bidiclasses") == 0)
+      {
+      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
+        printf("%3s %s\n", bd_names[i], bd_names[i+1]);
+      }
+
+    else
+      {
+      printf("** Unknown property \"%s\"\n", name);
+      break;
+      }
+    }
+  }
+
+else printf("** Unknown test command \"%s\"\n", name);
+}
+
+
+
+/*************************************************
+*               Main program                     *
+*************************************************/
+
+/* The only option is "-s", which must come first. If there are any other
+arguments, they are joined into one command, each followed by a space, with
+"findprop " inserted at the start if the first of them looks like character
+data (it starts with '+', "U+" or a digit, or consists entirely of hex
+digits). That command is processed and the program exits. Otherwise, commands
+are read line by line from stdin, using readline() when that is supported and
+stdin is a terminal, and echoed when stdin is not a terminal.
+
+The command buffer has a fixed size. Arguments that do not fit into it are
+reported as an error, and an over-long line from readline() is truncated.
+
+Returns:   always zero
+*/
+
+int
+main(int argc, char **argv)
+{
+BOOL interactive;
+int first_arg = 1;
+unsigned char buffer[1024];
+
+if (argc > 1 && strcmp(argv[1], "-s") == 0)
+  {
+  show_character = TRUE;
+  first_arg++;
+  }
+
+if (argc > first_arg)
+  {
+  int i;
+  BOOL datafirst = TRUE;
+  const char *arg = argv[first_arg];
+  size_t used = 0;
+
+  /* The ctype functions need an unsigned char value. */
+
+  if (*arg != '+' && strncmp(arg, "U+", 2) != 0 &&
+      !isdigit((unsigned char)*arg))
+    {
+    while (*arg != 0)
+      {
+      if (!isxdigit((unsigned char)*arg++)) { datafirst = FALSE; break; }
+      }
+    }
+
+  buffer[0] = 0;
+  if (datafirst)
+    {
+    strcpy(CS buffer, "findprop ");
+    used = 9;
+    }
+
+  /* Each argument needs room for itself, a space, and the final zero. */
+
+  for (i = first_arg; i < argc; i++)
+    {
+    size_t len = strlen(argv[i]);
+    if (len + 2 > sizeof(buffer) - used)
+      {
+      printf("** Command line is too long\n");
+      return 0;
+      }
+    memcpy(buffer + used, argv[i], len);
+    used += len;
+    buffer[used++] = ' ';
+    buffer[used] = 0;
+    }
+
+  process_command_line(buffer);
+  return 0;
+  }
+
+interactive = is_stdin_tty();
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) using_history();
+#endif
+
+for(;;)
+  {
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+  if (interactive)
+    {
+    size_t len;
+    unsigned char *s = US readline("> ");
+    if (s == NULL) break;
+    len = strlen(CS s);
+    if (len > 0) add_history(CS s);
+
+    /* The line can be of any length; leave room for the newline and zero. */
+
+    if (len > sizeof(buffer) - 2)
+      {
+      printf("** Input line is too long: truncated\n");
+      len = sizeof(buffer) - 2;
+      }
+    memcpy(buffer, s, len);
+    buffer[len] = '\n';
+    buffer[len+1] = 0;
+    free(s);
+    }
+  else
+#endif
+
+    {
+    if (interactive) printf("> ");
+    if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
+    if (!interactive) printf("%s", buffer);
+    }
+
+  process_command_line(buffer);
+  }
+
+if (interactive) printf("\n");
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) clear_history();
+#endif
+
+return 0;
+}
+
+/* End */