diff options
author | Assaf Gordon <assafgordon@gmail.com> | 2017-01-19 04:49:15 (GMT) |
---|---|---|
committer | Assaf Gordon <assafgordon@gmail.com> | 2017-01-19 04:49:15 (GMT) |
commit | 273ae1da8d7d4e2447f6a0b9caaf76da83817282 (patch) | |
tree | 57122550f3951d5b7039c533c072ae39658d154e | |
parent | b408bcee1e81a2c4cc187d520b1fd5ae0da68f13 (diff) | |
download | grep-unicode-escape-2.zip grep-unicode-escape-2.tar.gz grep-unicode-escape-2.tar.bz2 |
grep: add support for \u, \U, \N escape sequences unicode-escape-2
* src/unicode-escape.{c,h}: New module.
* src/Makefile.am: Include new module files.
* src/grep.c (convert_unicode_escapes): New function to parse \u,\U,\N
sequences in the input RE patterns.
(unicode_escape_error): Callback for parsing errors; calls 'die'.
(main): Call 'convert_unicode_escapes' before adding a new pattern
with '-f FILE', '-e PAT' and argv[1].
-rw-r--r-- | src/Makefile.am | 5 | ||||
-rw-r--r-- | src/grep.c | 98 | ||||
-rw-r--r-- | src/unicode-escape.c | 250 | ||||
-rw-r--r-- | src/unicode-escape.h | 121 |
4 files changed, 471 insertions, 3 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index b7b02af..d6b0652 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -30,9 +30,10 @@ grep_SOURCES = \ kwsearch.c \ kwset.c \ pcresearch.c \ - searchutils.c + searchutils.c \ + unicode-escape.c -noinst_HEADERS = grep.h kwset.h search.h system.h +noinst_HEADERS = grep.h kwset.h search.h system.h unicode-escape.h # Sometimes, the expansion of $(LIBINTL) includes -lc which may # include modules defining variables like 'optind', so libgreputils.a @@ -46,6 +46,7 @@ #include "quote.h" #include "safe-read.h" #include "search.h" +#include "unicode-escape.h" #include "version-etc.h" #include "xalloc.h" #include "xstrtol.h" @@ -2431,6 +2432,85 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p) return result; } +/* #define GREP_UNICODE_DEBUG */ + +/* Called from unicode-escape.c functions if + \u,\U,\N failed to parse */ +void +unicode_escape_error (const char* msg) +{ + die (EXIT_TROUBLE, 0, "%s", msg); +} + +/* Un-escape \u,\U,\N sequences. + 'pattern' does not need to be NUL terminated. It can contain multiple lines. + Content of 'pattern' is modified in-place. + new length is returned (always equal or shorter than 'len'). 
*/ +static size_t +convert_unicode_escapes (char *pattern, const size_t len) +{ + char *p = pattern; /* input pointer */ + char *q = pattern; /* inplace output pointer */ + size_t plen = len; /* remaining octets in p */ + size_t qlen = len; /* available octets in q */ + char *tmp; + unsigned int uc = 0; + +#ifdef GREP_UNICODE_DEBUG + fprintf (stderr, "convert_unicode_patterns (len=%zu, pattern='%s')\n", + len, pattern); +#endif + + /* memchr should be more efficient than iterating each character, + if there are no backslashes, skip the whole conversion */ + if (memchr(p,'\\', len)==NULL) + return len; + + while (plen>0) + { + + if (*p != '\\' || plen==1) + { + /* Copy one character */ + *q++ = *p++; + --plen, --qlen; + continue; + } + + ++p, --plen; + + if ( *p != 'u' && *p != 'U' && *p != 'N' ) + { + /* Copy backslash+character */ + *q++ = '\\'; + *q++ = *p++; + --plen; + qlen -= 2; + continue; + } + + /* Parse unicode-escape sequence, store in 'uc' */ + tmp = (char*) parse_unicode_escape (&uc, p, plen); + plen -= (tmp-p); +#ifdef GREP_UNICODE_DEBUG + fprintf(stderr,"parsed unicode escape, delta = %ld, post-qlen=%zu\n", + (tmp-p), plen); +#endif + p = tmp; + + /* Convert to multibyte and store in q */ + tmp = store_unicode (uc, q, qlen); + qlen -= (tmp-q); +#ifdef GREP_UNICODE_DEBUG + fprintf(stderr,"stored unicode char, delta = %ld, post-qlen=%zu\n", + (tmp-q), qlen); +#endif + q = tmp; + } + + return len - qlen; /* length of in-place updated output string */ +} + int main (int argc, char **argv) { @@ -2579,6 +2659,9 @@ main (int argc, char **argv) } oldcc = keycc; memcpy (keys + oldcc, optarg, cc); + + cc = convert_unicode_escapes (keys + oldcc, cc); + keycc += cc; keys[keycc++] = '\n'; fl_add (keys + oldcc, cc + 1, ""); @@ -2605,7 +2688,17 @@ main (int argc, char **argv) /* Append final newline if file ended in non-newline. 
*/ if (oldcc != keycc && keys[keycc - 1] != '\n') keys[keycc++] = '\n'; - fl_add (keys + oldcc, keycc - oldcc, optarg); + + { + /* Ugly hack: after un-escaping, the buffer's length might be + shorter, but 'keycc' already points to the end of the buffer. + We thus find the delta and adjust the pointer */ + const size_t oldlen = keycc - oldcc; + const size_t newlen = convert_unicode_escapes (keys + oldcc, oldlen); + fl_add (keys + oldcc, newlen, optarg); + keycc -= (oldlen-newlen); + } + break; case 'h': @@ -2799,6 +2892,9 @@ main (int argc, char **argv) /* Make a copy so that it can be reallocated or freed later. */ keycc = strlen (argv[optind]); keys = xmemdup (argv[optind++], keycc + 1); + + keycc = convert_unicode_escapes (keys, keycc); + fl_add (keys, keycc, ""); n_patterns++; } diff --git a/src/unicode-escape.c b/src/unicode-escape.c new file mode 100644 index 0000000..5fd27ad --- /dev/null +++ b/src/unicode-escape.c @@ -0,0 +1,250 @@ +#include <config.h> +#include <stdlib.h> +#include <errno.h> +#include <c-ctype.h> +#include <unistr.h> +#include <assert.h> + +#include "unicode-escape.h" + +/* #define UNICODE_ESCAPE_DEBUG */ + +#ifdef UNICODE_ESCAPE_DEBUG +#include <stdio.h> +#endif + +#define _(x) (x) + +/* Bound on the length of a Unicode character name. As of + Unicode 9.0.0 the maximum is 83, so this should be safe. */ +enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 }; + +/* Copied from coreutils' system.h */ +static inline unsigned char to_uchar (char ch) { return ch; } + +/* Copied form GNU coreutils' "printf.c:print_esc()" */ +#define hextobin(c) ((c) >= 'a' && (c) <= 'f' ? (c) - 'a' + 10 : \ + (c) >= 'A' && (c) <= 'F' ? 
(c) - 'A' + 10 : (c) - '0') + + +/* Parse named character escape sequences \N{x} */ +const char* +parse_named_character (unsigned int /*OUTPUT*/ *uc, + const char* buf, ptrdiff_t buflen) +{ + /* Lifted from Emacs' ./src/lread.c:read_escape() */ + unsigned char c; + bool whitespace = false; + char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1]; + ptrdiff_t length = 0; + + /* String doesn't start with 'N' - that's a programming error */ + assert (buflen > 0); + assert (*buf == 'N'); + + if (buflen<2) + { + unicode_escape_error (_("incomplete \\N{} sequence")); + return NULL; + } + + ++buf; + if (*buf++ != '{') + { + unicode_escape_error (_("missing '{' after \\N")); + return NULL; + } + + buflen -= 2; + + while (true) + { + if (--buflen<0) + { + unicode_escape_error (_("incomplete \\N{} sequence")); + return NULL; + } + + + c = to_uchar(*buf++); + if (!c) + { + unicode_escape_error (_("incomplete \\N{} sequence")); + return NULL; + } + + if (c == '}') + break; + + if (! (0 < c && c < 0x80)) + { + unicode_escape_error (_("invaid character in \\N escape sequence")); + return NULL; + } + + /* Treat multiple adjacent whitespace characters as a + single space character. This makes it easier to use + character names in e.g. multi-line strings. */ + if (c_isspace (c)) + { + if (whitespace) + continue; + c = ' '; + whitespace = true; + } + else + whitespace = false; + + name[length++] = c; + if (length >= sizeof name) + { + unicode_escape_error (_("character name too long in \\N sequence")); + return NULL; + } + } + + if (length == 0) + { + unicode_escape_error (_("Empty character name in \\N sequence")); + return NULL; + } + + name[length] = '\0'; + +#ifdef UNICODE_ESCAPE_DEBUG + fprintf (stderr,"parsed name character: \\N{%s}\n", name); +#endif + + /* Convert U+XXXX to a number */ + if (name[0] == 'U' && name[1] == '+') + { + /* For "U+XXXX", pass the leading '+' to string_to_number to reject + monstrosities like "U+-0000". 
*/ + errno = 0; + char *ep; + unsigned long l = strtoul (name+1, &ep, 16); +#ifdef UNICODE_ESCAPE_DEBUG + fprintf (stderr,"parsed U+XXXX value: 0x%0lx\n", l); +#endif + if (errno != 0 || l == 0 || l > 0x10FFFF) + { + unicode_escape_error (_("Invalid value for \\N{U+XXXX} sequence")); + return NULL; + } + + if (uc) + *uc = l; + } + else + { + /* FIXME, use gnulib's 'uniname' module, store value in '*uc' */ + unicode_escape_error (_("Named conversion not implemented yet. " \ + "Please use \\N{U+XXXX}")); + return NULL; + } + + return buf; +} + + + +/* Convert Unicode escape sequences \uHHHH \UHHHHHHHH */ + +const char* +parse_unicode_codepoint (unsigned int *uc, + const char *buf, ptrdiff_t buflen) +{ + /* String doesn't start with 'u/U' - that's a programming error */ + assert (buflen>0); + assert (*buf == 'u' || *buf == 'U'); + + /* Copied form GNU coreutils' "printf.c:print_esc()" */ + const char *p = buf; + const char esc_char = *p; + unsigned int uni_value = 0 ; + int esc_length = (esc_char == 'u' ? 4 : 8); + + /* return the correct error for either 'U' or 'u'. + This will help the user know if 4 or 8 hexdigits + were expected. */ + const char *err = (esc_char == 'u') \ + ? _("missing hexdigits in \\uHHHH") + : _("missing hexdigits in \\UHHHHHHHH"); + + if (buflen < (esc_length+1)) + { + unicode_escape_error (err); + return NULL; + } + + for (++p; + esc_length > 0; + --esc_length, ++p) + { + if (! 
c_isxdigit (to_uchar (*p))) + { + unicode_escape_error (err); + return NULL; + } + + + uni_value = uni_value * 16 + hextobin (*p); + } + + if (esc_length>0) + { + unicode_escape_error (err); + return NULL; + } + + +#ifdef UNICODE_ESCAPE_DEBUG + fprintf(stderr,"Parsed \%c+%x value\n", esc_char, uni_value); +#endif + + if (uc) + *uc = uni_value; + + return p; +} + + +const char* +parse_unicode_escape (unsigned int /*OUTPUT*/ *uc, + const char *buf, ptrdiff_t buflen) +{ + assert ( buflen > 0 ); + const char c = *buf; + assert ( c == 'N' || c == 'u' || c == 'U' ); + + if (c=='N') + return parse_named_character (uc, buf, buflen); + else + return parse_unicode_codepoint (uc, buf, buflen); +} + + +char* +store_unicode (ucs4_t uc, char *buf, ptrdiff_t buflen) +{ + assert(buflen>=4); /* FIXME: allow smaller buffers? */ + + /* FIXME: convert ucs4_t to current locale, + instead of utf-8, possibly using gnulib's + unicodeio.c:unicode_to_mb() */ + int i = u8_uctomb ( (uint8_t*)buf, uc, buflen); + + if (i<1) + { + unicode_escape_error (_("invalid/forbidden unicode value")); + return NULL; + } + +#ifdef UNICODE_ESCAPE_DEBUG + fprintf(stderr,"store_uncode(uc = 0x%x) returned:\n", uc); + for (int j=0; j<i; ++j) + fprintf(stderr, " buf[%d]=0x%02x \n", j, to_uchar(buf[j])); +#endif + + return buf + (ptrdiff_t)i; /* TODO: vlaidate pointer arithmetic */ +} diff --git a/src/unicode-escape.h b/src/unicode-escape.h new file mode 100644 index 0000000..e089629 --- /dev/null +++ b/src/unicode-escape.h @@ -0,0 +1,121 @@ +/* Functions for prasing unicode escapes (\u, \U, \N). + Copyright (C) 2017 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + + +/* +Contrived Usage Example: + + // This function will be called on errors + void unicode_escape_error ( const char * err ) + { + fprintf (stderr, "unicode conversion failed: %s\n", err); + exit (1); + } + + char *p = "hello\N{U+03a8}world"; + char *ep = NULL; + unsigned int uc = 0; + + p = strchr (p, '\\')+1; + + // + // Step 1: Parse the string, extract the unicode code point + // + p = parse_unicode_escape (&uc, p, strlen(p)); + // uc == 0x03a8 + + // + // Step 2: convert unicode codepoint to multibyte string + // + char *outbuf = malloc (100); + char *op = outbuf; + op = store_unicode (uc, op, 100); + + + +All three parsing functions have the same interface: + parse_unicode_escape: Parse \N, \u, \U + parse_named_character: Parse only \N + parse_unicode_codepoint: Parse only \u, \U + +*/ + + +/* + */ +extern void +unicode_escape_error (const char* msg); + +/* Parse \N{...} string. + + buf should point to the 'N' character (past the backslash). + does not need to be NUL terminated. + + buflen is the maximum number of octets to read. + + 'uc' will contain the unicode point code. + + Return position after the parsed string, + or NULL if parsing error occured. +*/ +const char* +parse_named_character (unsigned int /*OUTPUT*/ *uc, + const char* buf, ptrdiff_t buflen); + + + +/* Parse \uHHHH and \UHHHHHHHH strings. + + buf should point to the 'u' (or 'U') character (past the backslash). + does not need to be NUL terminated. + + buflen is the maximum number of octets to read. + + 'uc' will contain the unicode point code. 
+ + Return position after the parsed string, + or NULL if parsing error occured. +*/ +const char* +parse_unicode_codepoint(unsigned int /*OUTPUT*/ *result_uc, + const char *buf, ptrdiff_t buflen); + + +/* + Parses \N,\u,\U strings. + + buf should point to the N/u/U character (past the backslash). + does not need to be NUL terminated. + + buflen is the maximum number of octets to read. + + 'uc' will contain the unicode point code. + + Return position after the parsed string, + or NULL if parsing error occured. + + Calls either parse_named_character() or parse_unicode_codepoint() + depending on the first character in 'buf'. +*/ +const char* +parse_unicode_escape (unsigned int /*OUTPUT*/ *result_uc, + const char *buf, ptrdiff_t buflen); + + +char* +store_unicode (unsigned int uc, + char *buf, ptrdiff_t buflen); |