diff options
Diffstat (limited to 'maint/ucptest.c')
-rw-r--r-- | maint/ucptest.c | 1086 |
1 files changed, 1086 insertions, 0 deletions
diff --git a/maint/ucptest.c b/maint/ucptest.c new file mode 100644 index 00000000..6faead30 --- /dev/null +++ b/maint/ucptest.c @@ -0,0 +1,1086 @@ +/*************************************************** +* A program for testing the Unicode property table * +***************************************************/ + +/* Copyright (c) University of Cambridge 2008-2022 */ + +/* Compile thus: + + gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \ + ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c + + Add -lreadline or -ledit if PCRE2 was configured with readline or libedit + support in pcre2test. +*/ + +/* This is a hacked-up program for testing the Unicode properties tables of +PCRE2. It can also be used for finding characters with certain properties. I +wrote it to help with debugging, and have added things that I found useful, in +a rather haphazard way. The code has never been seriously tidied or checked for +robustness, but it shouldn't now give compiler warnings. + +There is only one option: "-s". If given, it applies only to the "findprop" +command. It causes the UTF-8 sequence of bytes that encode the character to be +output between angle brackets at the end of the line. On a UTF-8 terminal, this +will show the appropriate graphic for the code point. + +If the command has arguments, they are concatenated into a buffer, separated by +spaces. If the first argument starts "U+" or consists entirely of hexadecimal +digits, "findprop" is inserted at the start. The buffer is then processed as a +single line file, after which the program exits. If there are no arguments, the +program reads commands line by line on stdin and writes output to stdout. The +return code is always zero. + +There are three commands: + +The command "findprop" must be followed by a space-separated list of Unicode +code points as hex numbers, either without any prefix or starting with "U+", or +as individual UTF-8 characters preceded by '+'. For example: + + findprop U+1234 5Abc +? + +The output is one long line per character, listing Unicode properties that have +values, followed by its other case or cases if one or more exist, followed by +its Script Extension list if there is one. This list is in square brackets. A +second list in square brackets gives all the Boolean properties of the +character. The properties that come first are: + + Bidi class e.g. NSM (most common is L) + General type e.g. Letter + Specific type e.g. Upper case letter + Script e.g. Medefaidrin + Grapheme break type e.g. Extend (most common is Other) + +Script names and Boolean property names are all in lower case, with underscores +and hyphens removed, because that's how they are stored for "loose" matching. + +The command "find" must be followed by a list of property types and their +values. The values are case-sensitive, except for bidi class. This finds +characters that have those properties. If multiple properties are listed, they +must all be matched. Currently supported: + + script <name> The character must have this script property. Only one + such script may be given. + scriptx <name> This script must be in the character's Script Extension + property list. If this is used many times, all the given + scripts must be present. + type <abbrev> The character's specific type (e.g. Lu or Nd) must match. + gbreak <name> The grapheme break property must match. + bidi <class> The character's bidi class must match. + bool <name> The character's Boolean property list must contain this + property. + +If a <name> or <abbrev> is preceded by !, the value must NOT be present. For +Script Extensions and Boolean properties, there may be a mixture of positive +and negative requirements. All must be satisfied. + +Sequences of two or more characters are shown as ranges, for example +U+0041..U+004A. No more than 100 lines are are output. If there are more +characters, the list ends with ... + +The command "list" must be followed by one of property names script, bool, +type, gbreak or bidi. The defined values for that property are listed. */ + + +#ifdef HAVE_CONFIG_H +#include "../src/config.h" +#endif + +#ifndef SUPPORT_UNICODE +#define SUPPORT_UNICODE +#endif + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "../src/pcre2_internal.h" +#include "../src/pcre2_ucp.h" + +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) +#if defined(SUPPORT_LIBREADLINE) +#include <readline/readline.h> +#include <readline/history.h> +#else +#if defined(HAVE_EDITLINE_READLINE_H) +#include <editline/readline.h> +#else +#include <readline/readline.h> +#ifdef RL_VERSION_MAJOR +#include <readline/history.h> +#endif +#endif +#endif +#endif + + +/* -------------------------------------------------------------------*/ + +#define CS (char *) +#define CCS (const char *) +#define CSS (char **) +#define US (unsigned char *) +#define CUS (const unsigned char *) +#define USS (unsigned char **) + +/* -------------------------------------------------------------------*/ + +static BOOL show_character = FALSE; + +static const unsigned char *type_names[] = { + US"Cc", US"Control", + US"Cf", US"Format", + US"Cn", US"Unassigned", + US"Co", US"Private use", + US"Cs", US"Surrogate", + US"Ll", US"Lower case letter", + US"Lm", US"Modifier letter", + US"Lo", US"Other letter", + US"Lt", US"Title case letter", + US"Lu", US"Upper case letter", + US"Mc", US"Spacing mark", + US"Me", US"Enclosing mark", + US"Mn", US"Non-spacing mark", + US"Nd", US"Decimal number", + US"Nl", US"Letter number", + US"No", US"Other number", + US"Pc", US"Connector punctuation", + US"Pd", US"Dash punctuation", + US"Pe", US"Close punctuation", + US"Pf", US"Final punctuation", + US"Pi", US"Initial punctuation", + US"Po", US"Other punctuation", + US"Ps", US"Open punctuation", + US"Sc", US"Currency symbol", + US"Sk", US"Modifier symbol", + US"Sm", US"Mathematical symbol", + US"So", US"Other symbol", + US"Zl", US"Line separator", + US"Zp", US"Paragraph separator", + US"Zs", US"Space separator" +}; + +static const unsigned char *gb_names[] = { + US"CR", US"carriage return", + US"LF", US"linefeed", + US"Control", US"", + US"Extend", US"", + US"Prepend", US"", + US"SpacingMark", US"", + US"L", US"Hangul syllable type L", + US"V", US"Hangul syllable type V", + US"T", US"Hangul syllable type T", + US"LV", US"Hangul syllable type LV", + US"LVT", US"Hangul syllable type LVT", + US"Regional_Indicator", US"", + US"Other", US"", + US"ZWJ", US"zero width joiner", + US"Extended_Pictographic", US"" +}; + +static const unsigned char *bd_names[] = { + US"AL", US"Arabic letter", + US"AN", US"Arabid number", + US"B", US"Paragraph separator", + US"BN", US"Boundary neutral", + US"CS", US"Common separator", + US"EN", US"European number", + US"ES", US"European separator", + US"ET", US"European terminator", + US"FSI", US"First string isolate", + US"L", US"Left-to-right", + US"LRE", US"Left-to-right embedding", + US"LRI", US"Left-to-right isolate", + US"LRO", US"Left-to-right override", + US"NSM", US"Non-spacing mark", + US"ON", US"Other neutral", + US"PDF", US"Pop directional format", + US"PDI", US"Pop directional isolate", + US"R", US"Right-to-left", + US"RLE", US"Right-to-left embedding", + US"RLI", US"Right-to-left isolate", + US"RLO", US"Right-to-left override", + US"S", US"Segment separator", + US"WS", US"White space" +}; + +static const unsigned int utf8_table1[] = { + 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff}; + +static const int utf8_table2[] = { + 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + +/* Macro to pick up the remaining bytes of a UTF-8 character, advancing +the pointer. */ + +#define GETUTF8INC(c, eptr) \ + { \ + if ((c & 0x20u) == 0) \ + c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \ + else if ((c & 0x10u) == 0) \ + { \ + c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \ + eptr += 2; \ + } \ + else if ((c & 0x08u) == 0) \ + { \ + c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \ + ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ + eptr += 3; \ + } \ + else if ((c & 0x04u) == 0) \ + { \ + c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \ + ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \ + (eptr[3] & 0x3fu); \ + eptr += 4; \ + } \ + else \ + { \ + c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \ + ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \ + ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \ + eptr += 5; \ + } \ + } + + + +/************************************************* +* Convert character value to UTF-8 * +*************************************************/ + +/* This function takes an unsigned long integer value in the range 0 - +0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes. + +Arguments: + cvalue the character value + buffer pointer to buffer for result - at least 6 bytes long + +Returns: number of bytes placed in the buffer + 0 if input code point is too big +*/ + +static size_t +ord2utf8(unsigned int cvalue, unsigned char *buffer) +{ +size_t i, j; +for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) + if (cvalue <= utf8_table1[i]) break; +if (i >= sizeof(utf8_table1)/sizeof(int)) return 0; +buffer += i; +for (j = i; j > 0; j--) + { + *buffer-- = 0x80 | (cvalue & 0x3f); + cvalue >>= 6; + } +*buffer = utf8_table2[i] | cvalue; +return i + 1; +} + + + +/************************************************* +* Test for interaction * +*************************************************/ + +static BOOL +is_stdin_tty(void) +{ +#if defined WIN32 +return _isatty(_fileno(stdin)); +#else +return isatty(fileno(stdin)); +#endif +} + + +/************************************************* +* Get name from ucp ident * +*************************************************/ + +/* The utt table contains both full names and abbreviations. So search for both +and use the longer if two are found, unless the first one is only 3 characters +and we are looking for a script (some scripts have 3-character names). If this +were not just a test program it might be worth making some kind of reverse +index. */ + +static const char * +get_propname(int prop, int type) +{ +size_t i, j, len; +size_t foundlist[2]; +const char *yield; +int typex = (type == PT_SC)? PT_SCX : type; + +j = 0; +for (i = 0; i < PRIV(utt_size); i++) + { + const ucp_type_table *u = PRIV(utt) + i; + if ((u->type == type || u->type == typex) && u->value == prop) + { + foundlist[j++] = i; + if (j >= 2) break; + } + } + +if (j == 0) return "??"; + +yield = NULL; +len = 0; + +for (i = 0; i < j; i++) + { + const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset; + size_t sl = strlen(s); + + if (sl > len) + { + yield = s; + if (sl == 3 && type == PT_SC) break; + len = sl; + } + } + +return yield; +} + + +/************************************************* +* Print Unicode property info for a char * +*************************************************/ + +static void +print_prop(unsigned int c, BOOL is_just_one) +{ +int type = UCD_CATEGORY(c); +int fulltype = UCD_CHARTYPE(c); +int script = UCD_SCRIPT(c); +int scriptx = UCD_SCRIPTX(c); +int gbprop = UCD_GRAPHBREAK(c); +int bidi = UCD_BIDICLASS(c); +unsigned int othercase = UCD_OTHERCASE(c); +int caseset = UCD_CASESET(c); +int bprops = UCD_BPROPS(c); + +const unsigned char *fulltypename = US"??"; +const unsigned char *typename = US"??"; +const unsigned char *graphbreak = US"??"; +const unsigned char *bidiclass = US"??"; +const unsigned char *scriptname = CUS get_propname(script, PT_SC); + +switch (type) + { + case ucp_C: typename = US"Control"; break; + case ucp_L: typename = US"Letter"; break; + case ucp_M: typename = US"Mark"; break; + case ucp_N: typename = US"Number"; break; + case ucp_P: typename = US"Punctuation"; break; + case ucp_S: typename = US"Symbol"; break; + case ucp_Z: typename = US"Separator"; break; + } + +switch (fulltype) + { + case ucp_Cc: fulltypename = US"Control"; break; + case ucp_Cf: fulltypename = US"Format"; break; + case ucp_Cn: fulltypename = US"Unassigned"; break; + case ucp_Co: fulltypename = US"Private use"; break; + case ucp_Cs: fulltypename = US"Surrogate"; break; + case ucp_Ll: fulltypename = US"Lower case letter"; break; + case ucp_Lm: fulltypename = US"Modifier letter"; break; + case ucp_Lo: fulltypename = US"Other letter"; break; + case ucp_Lt: fulltypename = US"Title case letter"; break; + case ucp_Lu: fulltypename = US"Upper case letter"; break; + case ucp_Mc: fulltypename = US"Spacing mark"; break; + case ucp_Me: fulltypename = US"Enclosing mark"; break; + case ucp_Mn: fulltypename = US"Non-spacing mark"; break; + case ucp_Nd: fulltypename = US"Decimal number"; break; + case ucp_Nl: fulltypename = US"Letter number"; break; + case ucp_No: fulltypename = US"Other number"; break; + case ucp_Pc: fulltypename = US"Connector punctuation"; break; + case ucp_Pd: fulltypename = US"Dash punctuation"; break; + case ucp_Pe: fulltypename = US"Close punctuation"; break; + case ucp_Pf: fulltypename = US"Final punctuation"; break; + case ucp_Pi: fulltypename = US"Initial punctuation"; break; + case ucp_Po: fulltypename = US"Other punctuation"; break; + case ucp_Ps: fulltypename = US"Open punctuation"; break; + case ucp_Sc: fulltypename = US"Currency symbol"; break; + case ucp_Sk: fulltypename = US"Modifier symbol"; break; + case ucp_Sm: fulltypename = US"Mathematical symbol"; break; + case ucp_So: fulltypename = US"Other symbol"; break; + case ucp_Zl: fulltypename = US"Line separator"; break; + case ucp_Zp: fulltypename = US"Paragraph separator"; break; + case ucp_Zs: fulltypename = US"Space separator"; break; + } + +switch(gbprop) + { + case ucp_gbCR: graphbreak = US"CR"; break; + case ucp_gbLF: graphbreak = US"LF"; break; + case ucp_gbControl: graphbreak = US"Control"; break; + case ucp_gbExtend: graphbreak = US"Extend"; break; + case ucp_gbPrepend: graphbreak = US"Prepend"; break; + case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break; + case ucp_gbL: graphbreak = US"Hangul syllable type L"; break; + case ucp_gbV: graphbreak = US"Hangul syllable type V"; break; + case ucp_gbT: graphbreak = US"Hangul syllable type T"; break; + case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break; + case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break; + case ucp_gbRegional_Indicator: + graphbreak = US"Regional Indicator"; break; + case ucp_gbOther: graphbreak = US"Other"; break; + case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break; + case ucp_gbExtended_Pictographic: + graphbreak = US"Extended Pictographic"; break; + default: graphbreak = US"Unknown"; break; + } + +switch(bidi) + { + case ucp_bidiAL: bidiclass = US"AL "; break; + case ucp_bidiFSI: bidiclass = US"FSI"; break; + case ucp_bidiL: bidiclass = US"L "; break; + case ucp_bidiLRE: bidiclass = US"LRE"; break; + case ucp_bidiLRI: bidiclass = US"LRI"; break; + case ucp_bidiLRO: bidiclass = US"LRO"; break; + case ucp_bidiPDF: bidiclass = US"PDF"; break; + case ucp_bidiPDI: bidiclass = US"PDI"; break; + case ucp_bidiR: bidiclass = US"R "; break; + case ucp_bidiRLE: bidiclass = US"RLE"; break; + case ucp_bidiRLI: bidiclass = US"RLI"; break; + case ucp_bidiRLO: bidiclass = US"RLO"; break; + case ucp_bidiAN: bidiclass = US"AN "; break; + case ucp_bidiB: bidiclass = US"B "; break; + case ucp_bidiBN: bidiclass = US"BN "; break; + case ucp_bidiCS: bidiclass = US"CS "; break; + case ucp_bidiEN: bidiclass = US"EN "; break; + case ucp_bidiES: bidiclass = US"ES "; break; + case ucp_bidiET: bidiclass = US"ET "; break; + case ucp_bidiNSM: bidiclass = US"NSM"; break; + case ucp_bidiON: bidiclass = US"ON "; break; + case ucp_bidiS: bidiclass = US"S "; break; + case ucp_bidiWS: bidiclass = US"WS "; break; + default: bidiclass = US"???"; break; + } + +printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename, + scriptname, graphbreak); + +if (is_just_one && othercase != c) + { + printf(", U+%04X", othercase); + if (caseset != 0) + { + const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1; + while (*(++p) < NOTACHAR) + { + unsigned int d = *p; + if (d != othercase && d != c) printf(", U+%04X", d); + } + } + } + +if (scriptx != 0) + { + const char *sep = ""; + const uint32_t *p = PRIV(ucd_script_sets) + scriptx; + printf(", ["); + for (int i = 0; i < ucp_Unknown; i++) + if (MAPBIT(p, i) != 0) + { + printf("%s%s", sep, get_propname(i, PT_SC)); + sep = ", "; + } + printf("]"); + } + +if (bprops != 0) + { + const char *sep = ""; + const uint32_t *p = PRIV(ucd_boolprop_sets) + + bprops * ucd_boolprop_sets_item_size; + printf(", ["); + for (int i = 0; i < ucp_Bprop_Count; i++) + if (MAPBIT(p, i) != 0) + { + printf("%s%s", sep, get_propname(i, PT_BOOL)); + sep = ", "; + } + printf("]"); + } + +if (show_character && is_just_one) + { + unsigned char buffer[8]; + size_t len = ord2utf8(c, buffer); + printf(", >%.*s<", (int)len, buffer); + } + +printf("\n"); +} + + + +/************************************************* +* Find character(s) with given property/ies * +*************************************************/ + +static void +find_chars(unsigned char *s) +{ +unsigned char name[128]; +unsigned char value[128]; +unsigned char *t; +unsigned int count= 0; +int scriptx_list[128]; +unsigned int scriptx_count = 0; +int bprop_list[128]; +unsigned int bprop_count = 0; +uint32_t i, c; +int script = -1; +int type = -1; +int gbreak = -1; +int bidiclass = -1; +BOOL script_not = FALSE; +BOOL type_not = FALSE; +BOOL gbreak_not = FALSE; +BOOL bidiclass_not = FALSE; +BOOL hadrange = FALSE; +const ucd_record *ucd, *next_ucd; +const char *pad = " "; + +while (*s != 0) + { + unsigned int offset = 0; + BOOL scriptx_not = FALSE; + + for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; + *t = 0; + while (isspace(*s)) s++; + + for (t = value; *s != 0 && !isspace(*s); s++) + { + if (*s != '_' && *s != '-') *t++ = *s; + } + *t = 0; + while (isspace(*s)) s++; + + if (strcmp(CS name, "script") == 0 || + strcmp(CS name, "scriptx") == 0) + { + for (t = value; *t != 0; t++) *t = tolower(*t); + + if (value[0] == '!') + { + if (name[6] == 'x') scriptx_not = TRUE; + else script_not = TRUE; + offset = 1; + } + + for (i = 0; i < PRIV(utt_size); i++) + { + const ucp_type_table *u = PRIV(utt) + i; + if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset), + PRIV(utt_names) + u->name_offset) == 0) + { + c = u->value; + if (name[6] == 'x') + { + scriptx_list[scriptx_count++] = scriptx_not? (-c):c; + } + else + { + if (script < 0) script = c; else + { + printf("** Only 1 script value allowed\n"); + return; + } + } + break; + } + } + + if (i >= PRIV(utt_size)) + { + printf("** Unrecognized script name \"%s\"\n", value); + return; + } + } + + else if (strcmp(CS name, "bool") == 0) + { + int not = 1; + if (value[0] == '!') + { + not = -1; + offset = 1; + } + + for (i = 0; i < PRIV(utt_size); i++) + { + const ucp_type_table *u = PRIV(utt) + i; + if (u->type == PT_BOOL && strcmp(CS(value + offset), + PRIV(utt_names) + u->name_offset) == 0) + { + bprop_list[bprop_count++] = u->value * not; + break; + } + } + + if (i >= PRIV(utt_size)) + { + printf("** Unrecognized property name \"%s\"\n", value); + return; + } + } + + else if (strcmp(CS name, "type") == 0) + { + if (type >= 0) + { + printf("** Only 1 type value allowed\n"); + return; + } + else + { + if (value[0] == '!') + { + type_not = TRUE; + offset = 1; + } + + for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2) + { + if (strcmp(CS (value + offset), CS type_names[i]) == 0) + { + type = i/2; + break; + } + } + if (i >= sizeof(type_names)/sizeof(char *)) + { + printf("** Unrecognized type name \"%s\"\n", value); + return; + } + } + } + + else if (strcmp(CS name, "gbreak") == 0) + { + if (gbreak >= 0) + { + printf("** Only 1 grapheme break value allowed\n"); + return; + } + else + { + if (value[0] == '!') + { + gbreak_not = TRUE; + offset = 1; + } + + for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2) + { + if (strcmp(CS (value + offset), CS gb_names[i]) == 0) + { + gbreak = i/2; + break; + } + } + if (i >= sizeof(gb_names)/sizeof(char *)) + { + printf("** Unrecognized gbreak name \"%s\"\n", value); + return; + } + } + } + + else if (strcmp(CS name, "bidi") == 0 || + strcmp(CS name, "bidiclass") == 0 || + strcmp(CS name, "bidi_class") == 0 ) + { + if (bidiclass >= 0) + { + printf("** Only 1 bidi class value allowed\n"); + return; + } + else + { + if (value[0] == '!') + { + bidiclass_not = TRUE; + offset = 1; + } + for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2) + { + if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0) + { + bidiclass = i/2; + break; + } + } + if (i >= sizeof(bd_names)/sizeof(char *)) + { + printf("** Unrecognized bidi class name \"%s\"\n", value); + return; + } + } + } + + else + { + printf("** Unrecognized property name \"%s\"\n", name); + return; + } + } + +if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 && + gbreak < 0 && bidiclass < 0) + { + printf("** No properties specified\n"); + return; + } + +for (c = 0; c <= 0x10ffff; c++) + { + if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue; + + if (scriptx_count > 0) + { + const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c); + unsigned int found = 0; + + for (i = 0; i < scriptx_count; i++) + { + int x = scriptx_list[i]/32; + int y = scriptx_list[i]%32; + + /* Positive requirment */ + if (scriptx_list[i] >= 0) + { + if ((bits_scriptx[x] & (1u<<y)) != 0) found++; + } + /* Negative requirement */ + else + { + if ((bits_scriptx[x] & (1u<<y)) == 0) found++; + } + } + + if (found != scriptx_count) continue; + } + + if (bprop_count > 0) + { + const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) + + UCD_BPROPS(c) * ucd_boolprop_sets_item_size; + unsigned int found = 0; + + for (i = 0; i < bprop_count; i++) + { + int x = bprop_list[i]/32; + int y = bprop_list[i]%32; + + /* Positive requirement */ + if (bprop_list[i] >= 0) + { + if ((bits_bprop[x] & (1u<<y)) != 0) found++; + } + /* Negative requirement */ + else + { + if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++; + } + } + + if (found != bprop_count) continue; + } + + if (type >= 0) + { + if (type_not) + { + if (type == UCD_CHARTYPE(c)) continue; + } + else + { + if (type != UCD_CHARTYPE(c)) continue; + } + } + + if (gbreak >= 0) + { + if (gbreak_not) + { + if (gbreak == UCD_GRAPHBREAK(c)) continue; + } + else + { + if (gbreak != UCD_GRAPHBREAK(c)) continue; + } + } + + if (bidiclass >= 0) + { + if (bidiclass_not) + { + if (bidiclass == UCD_BIDICLASS(c)) continue; + } + else + { + if (bidiclass != UCD_BIDICLASS(c)) continue; + } + } + + /* All conditions are met. Look for runs. */ + + ucd = GET_UCD(c); + + for (i = c + 1; i < 0x10ffff; i++) + { + next_ucd = GET_UCD(i); + if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break; + } + + if (--i > c) + { + printf("U+%04X..", c); + c = i; + hadrange = TRUE; + } + else if (hadrange) printf("%s", pad); + + print_prop(c, FALSE); + if (c >= 0x100000) pad = " "; + else if (c >= 0x10000) pad = " "; + count++; + if (count >= 100) + { + printf("...\n"); + break; + } + } + +if (count == 0) printf("No characters found\n"); +} + + +/************************************************* +* Process command line * +*************************************************/ + +static void +process_command_line(unsigned char *buffer) +{ +unsigned char *s, *t; +unsigned char name[24]; + +s = buffer; +while (isspace(*s)) s++; +if (*s == 0) return; + +for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; +*t = 0; +while (isspace(*s)) s++; + +if (strcmp(CS name, "findprop") == 0) + { + while (*s != 0) + { + unsigned int c; + unsigned char *endptr; + t = s; + + if (*t == '+') + { + c = *(++t); + if (c > 0x7fu) + { + GETCHARINC(c, t); + } + endptr = t+1; + } + else + { + if (strncmp(CS t, "U+", 2) == 0) t += 2; + c = strtoul(CS t, CSS(&endptr), 16); + } + + if (*endptr != 0 && !isspace(*endptr)) + { + while (*endptr != 0 && !isspace(*endptr)) endptr++; + printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s); + } + else + { + if (c > 0x10ffff) + printf("** U+%x is too big for a Unicode code point\n", c); + else + print_prop(c, TRUE); + } + s = endptr; + while (isspace(*s)) s++; + } + } + +else if (strcmp(CS name, "find") == 0) + { + find_chars(s); + } + +else if (strcmp(CS name, "list") == 0) + { + while (*s != 0) + { + size_t i; + for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; + *t = 0; + while (isspace(*s)) s++; + + if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0) + { + for (i = 0; i < PRIV(utt_size); i++) + if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC) + printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset); + } + + else if (strcmp(CS name, "bool") == 0) + { + for (i = 0; i < PRIV(utt_size); i++) + if (PRIV(utt)[i].type == PT_BOOL) + printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset); + } + + else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0) + { + for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2) + printf("%s %s\n", type_names[i], type_names[i+1]); + } + + else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0) + { + for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2) + { + if (gb_names[i+1][0] != 0) + printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]); + else + printf("%s\n", gb_names[i]); + } + } + + else if (strcmp(CS name, "bidi") == 0 || + strcmp(CS name, "bidiclasses") == 0) + { + for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2) + printf("%3s %s\n", bd_names[i], bd_names[i+1]); + } + + else + { + printf("** Unknown property \"%s\"\n", name); + break; + } + } + } + +else printf("** Unknown test command \"%s\"\n", name); +} + + + +/************************************************* +* Main program * +*************************************************/ + +int +main(int argc, char **argv) +{ +BOOL interactive; +int first_arg = 1; +unsigned char buffer[1024]; + +if (argc > 1 && strcmp(argv[1], "-s") == 0) + { + show_character = TRUE; + first_arg++; + } + +if (argc > first_arg) + { + int i; + BOOL datafirst = TRUE; + char *arg = argv[first_arg]; + unsigned char *s = buffer; + + if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg)) + { + while (*arg != 0) + { + if (!isxdigit(*arg++)) { datafirst = FALSE; break; } + } + } + + if (datafirst) + { + strcpy(CS s, "findprop "); + s += 9; + } + + for (i = first_arg; i < argc; i++) + { + s += sprintf(CS s, "%s ", argv[i]); + } + + process_command_line(buffer); + return 0; + } + +interactive = is_stdin_tty(); + +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) +if (interactive) using_history(); +#endif + +for(;;) + { +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) + if (interactive) + { + size_t len; + unsigned char *s = US readline("> "); + if (s == NULL) break; + len = strlen(CS s); + if (len > 0) add_history(CS s); + memcpy(buffer, s, len); + buffer[len] = '\n'; + buffer[len+1] = 0; + free(s); + } + else +#endif + + { + if (interactive) printf("> "); + if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break; + if (!interactive) printf("%s", buffer); + } + + process_command_line(buffer); + } + +if (interactive) printf("\n"); + +#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) +if (interactive) clear_history(); +#endif + +return 0; +} + +/* End */ |