summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAssaf Gordon <assafgordon@gmail.com>2017-01-15 06:11:43 (GMT)
committerAssaf Gordon <assafgordon@gmail.com>2017-01-15 06:11:43 (GMT)
commita886428effefdae90a9de721b9736f3f814dc766 (patch)
tree224b6ee3fac06a0cb277080bc9a885e03ebc3713
parent3438c3a65c655baed1bb764e41d7ddcced5f1e7c (diff)
downloadgrep-unicode-escape.zip
grep-unicode-escape.tar.gz
grep-unicode-escape.tar.bz2
grep: unicode-escape-moduleunicode-escape
-rw-r--r--src/Makefile.am5
-rw-r--r--src/grep.c24
-rw-r--r--src/unicode-escape.c250
-rw-r--r--src/unicode-escape.h121
4 files changed, 398 insertions, 2 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index b7b02af..d6b0652 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,9 +30,10 @@ grep_SOURCES = \
kwsearch.c \
kwset.c \
pcresearch.c \
- searchutils.c
+ searchutils.c \
+ unicode-escape.c
-noinst_HEADERS = grep.h kwset.h search.h system.h
+noinst_HEADERS = grep.h kwset.h search.h system.h unicode-escape.h
# Sometimes, the expansion of $(LIBINTL) includes -lc which may
# include modules defining variables like 'optind', so libgreputils.a
diff --git a/src/grep.c b/src/grep.c
index 0a674ec..a95bc9e 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -46,6 +46,7 @@
#include "quote.h"
#include "safe-read.h"
#include "search.h"
+#include "unicode-escape.h"
#include "version-etc.h"
#include "xalloc.h"
#include "xstrtol.h"
@@ -178,6 +179,16 @@ static void clear_asan_poison (void) { }
static void asan_poison (void const volatile *addr, size_t size) { }
#endif
+
+/* Error hook for the parsers in unicode-escape.c: they call this
+   when a \u, \U or \N escape cannot be parsed.  MSG is reported
+   through die with status EXIT_TROUBLE.  */
+void
+unicode_escape_error (const char *msg)
+{
+  die (EXIT_TROUBLE, 0, "%s", msg);
+}
+
+
/* The group separator used when context is requested. */
static const char *group_separator = SEP_STR_GROUP;
@@ -2421,6 +2432,16 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p)
return result;
}
+/* Debugging aid: print the pattern count, then every pattern, to
+   stderr.
+
+   KEYS is walked as a sequence of patterns separated by '\n', which
+   is how main() assembles the buffer.
+   NOTE(review): the walk also stops at a NUL byte, so it assumes
+   each of the n_patterns entries ends with '\n' or NUL -- confirm
+   against the eventual call site.  */
+void
+convert_unicode_patterns (char *keys)
+{
+  const char *pattern = keys;
+
+  fprintf (stderr, "n_patterns = %zu\n", n_patterns);
+  for (size_t i = 0; i < n_patterns; ++i)
+    {
+      /* Patterns are not individually NUL-terminated, so measure
+         the current one instead of handing it to %s.  */
+      size_t len = 0;
+      while (pattern[len] != '\0' && pattern[len] != '\n')
+        len++;
+
+      fprintf (stderr, "pattern[%zu] = '", i);
+      fwrite (pattern, 1, len, stderr);
+      fputs ("'\n", stderr);
+
+      /* Step over the separator, if any, to reach the next pattern.  */
+      pattern += len;
+      if (*pattern == '\n')
+        pattern++;
+    }
+}
+
int
main (int argc, char **argv)
{
@@ -2571,6 +2592,7 @@ main (int argc, char **argv)
memcpy (keys + oldcc, optarg, cc);
keycc += cc;
keys[keycc++] = '\n';
+ /* agn: de-unicode here */
fl_add (keys + oldcc, cc + 1, "");
break;
@@ -2595,6 +2617,7 @@ main (int argc, char **argv)
/* Append final newline if file ended in non-newline. */
if (oldcc != keycc && keys[keycc - 1] != '\n')
keys[keycc++] = '\n';
+ /* agn: de-unicode here */
fl_add (keys + oldcc, keycc - oldcc, optarg);
break;
@@ -2789,6 +2812,7 @@ main (int argc, char **argv)
/* Make a copy so that it can be reallocated or freed later. */
keycc = strlen (argv[optind]);
keys = xmemdup (argv[optind++], keycc + 1);
+ /* agn: de-unicode here */
fl_add (keys, keycc, "");
n_patterns++;
}
diff --git a/src/unicode-escape.c b/src/unicode-escape.c
new file mode 100644
index 0000000..5fd27ad
--- /dev/null
+++ b/src/unicode-escape.c
@@ -0,0 +1,250 @@
+#include <config.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <c-ctype.h>
+#include <unistr.h>
+#include <assert.h>
+
+#include "unicode-escape.h"
+
+/* #define UNICODE_ESCAPE_DEBUG */
+
+#ifdef UNICODE_ESCAPE_DEBUG
+#include <stdio.h>
+#endif
+
+#define _(x) (x)
+
+/* Bound on the length of a Unicode character name. As of
+ Unicode 9.0.0 the maximum is 83, so this should be safe. */
+enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
+
+/* Convert CH to unsigned char, so that a byte with the high bit set
+   is not seen as a negative value.  Copied from coreutils'
+   system.h.  */
+static inline unsigned char
+to_uchar (char ch)
+{
+  return (unsigned char) ch;
+}
+
+/* Numeric value of the hexadecimal digit C, in either letter case.
+   C is evaluated more than once and is not validated: anything
+   outside a-f and A-F is treated as a decimal digit.
+   Copied from GNU coreutils' "printf.c:print_esc()".  */
+#define hextobin(c)                                  \
+  (((c) >= 'a' && (c) <= 'f') ? (c) - 'a' + 10       \
+   : ((c) >= 'A' && (c) <= 'F') ? (c) - 'A' + 10     \
+   : (c) - '0')
+
+
+/* Parse a named character escape sequence \N{...}.
+
+   BUF points at the 'N' (just past the backslash) and need not be
+   NUL-terminated; at most BUFLEN bytes are examined.  Runs of
+   whitespace inside the braces are collapsed to a single space.
+   Only the \N{U+XXXX} form is implemented: XXXX must consist solely
+   of hexadecimal digits and denote a value in 1..0x10FFFF.
+
+   On success store the code point in *UC (if UC is not NULL) and
+   return the position just past the closing '}'.  On any error call
+   unicode_escape_error and, should that return, return NULL.  */
+const char*
+parse_named_character (unsigned int /*OUTPUT*/ *uc,
+                       const char* buf, ptrdiff_t buflen)
+{
+  /* Lifted from Emacs' ./src/lread.c:read_escape() */
+  bool whitespace = false;
+  char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
+  ptrdiff_t length = 0;
+
+  /* String doesn't start with 'N' - that's a programming error */
+  assert (buflen > 0);
+  assert (*buf == 'N');
+
+  if (buflen < 2)
+    {
+      unicode_escape_error (_("incomplete \\N{} sequence"));
+      return NULL;
+    }
+
+  ++buf;
+  if (*buf++ != '{')
+    {
+      unicode_escape_error (_("missing '{' after \\N"));
+      return NULL;
+    }
+
+  /* 'N' and '{' have been consumed.  */
+  buflen -= 2;
+
+  /* Collect the text between the braces into NAME.  */
+  while (true)
+    {
+      if (--buflen < 0)
+        {
+          unicode_escape_error (_("incomplete \\N{} sequence"));
+          return NULL;
+        }
+
+      unsigned char c = to_uchar (*buf++);
+      if (!c)
+        {
+          unicode_escape_error (_("incomplete \\N{} sequence"));
+          return NULL;
+        }
+
+      if (c == '}')
+        break;
+
+      /* Only ASCII is allowed inside the braces.  */
+      if (c >= 0x80)
+        {
+          unicode_escape_error (_("invalid character in \\N escape sequence"));
+          return NULL;
+        }
+
+      /* Treat multiple adjacent whitespace characters as a
+         single space character.  This makes it easier to use
+         character names in e.g. multi-line strings.  */
+      if (c_isspace (c))
+        {
+          if (whitespace)
+            continue;
+          c = ' ';
+          whitespace = true;
+        }
+      else
+        whitespace = false;
+
+      /* Keep the last byte of NAME free for the terminating NUL.  */
+      if (length >= UNICODE_CHARACTER_NAME_LENGTH_BOUND)
+        {
+          unicode_escape_error (_("character name too long in \\N sequence"));
+          return NULL;
+        }
+      name[length++] = c;
+    }
+
+  if (length == 0)
+    {
+      unicode_escape_error (_("Empty character name in \\N sequence"));
+      return NULL;
+    }
+
+  name[length] = '\0';
+
+#ifdef UNICODE_ESCAPE_DEBUG
+  fprintf (stderr,"parsed name character: \\N{%s}\n", name);
+#endif
+
+  /* Convert U+XXXX to a number */
+  if (name[0] == 'U' && name[1] == '+')
+    {
+      /* Accept one or more hex digits and nothing else.  strtoul on
+         its own would also take a sign, a "0x" prefix, or stop
+         silently at trailing garbage such as "U+41zz".  */
+      const char *digits = name + 2;
+      ptrdiff_t ndigits = 0;
+      while (c_isxdigit (to_uchar (digits[ndigits])))
+        ndigits++;
+      bool well_formed = (ndigits > 0 && digits[ndigits] == '\0');
+
+      errno = 0;
+      unsigned long l = well_formed ? strtoul (digits, NULL, 16) : 0;
+#ifdef UNICODE_ESCAPE_DEBUG
+      fprintf (stderr,"parsed U+XXXX value: 0x%0lx\n", l);
+#endif
+      /* errno catches overflow of unsigned long on very long digit
+         strings; zero and values above 0x10FFFF are rejected.  */
+      if (!well_formed || errno != 0 || l == 0 || l > 0x10FFFF)
+        {
+          unicode_escape_error (_("Invalid value for \\N{U+XXXX} sequence"));
+          return NULL;
+        }
+
+      if (uc)
+        *uc = l;
+    }
+  else
+    {
+      /* FIXME, use gnulib's 'uniname' module, store value in '*uc' */
+      unicode_escape_error (_("Named conversion not implemented yet. " \
+                              "Please use \\N{U+XXXX}"));
+      return NULL;
+    }
+
+  return buf;
+}
+
+
+
+/* Convert Unicode escape sequences \uHHHH \UHHHHHHHH.
+
+   BUF points at the 'u' or 'U' (just past the backslash) and need
+   not be NUL-terminated; BUFLEN is the number of bytes available.
+   'u' must be followed by exactly 4 hexadecimal digits, 'U' by
+   exactly 8.  The resulting value is not range-checked here.
+
+   On success store the value in *UC (if UC is not NULL) and return
+   the position just past the last digit.  On error call
+   unicode_escape_error and, should that return, return NULL.  */
+const char*
+parse_unicode_codepoint (unsigned int *uc,
+                         const char *buf, ptrdiff_t buflen)
+{
+  /* String doesn't start with 'u/U' - that's a programming error */
+  assert (buflen > 0);
+  assert (*buf == 'u' || *buf == 'U');
+
+  /* Copied from GNU coreutils' "printf.c:print_esc()" */
+  const char esc_char = *buf;
+  const int esc_length = (esc_char == 'u' ? 4 : 8);
+
+  /* return the correct error for either 'U' or 'u'.
+     This will help the user know if 4 or 8 hexdigits
+     were expected. */
+  const char *err = (esc_char == 'u')
+                    ? _("missing hexdigits in \\uHHHH")
+                    : _("missing hexdigits in \\UHHHHHHHH");
+
+  /* The escape letter plus all of its digits must fit in BUF.  */
+  if (buflen < esc_length + 1)
+    {
+      unicode_escape_error (err);
+      return NULL;
+    }
+
+  const char *p = buf + 1;
+  unsigned int uni_value = 0;
+  for (int i = 0; i < esc_length; ++i, ++p)
+    {
+      if (! c_isxdigit (to_uchar (*p)))
+        {
+          unicode_escape_error (err);
+          return NULL;
+        }
+
+      uni_value = uni_value * 16 + hextobin (*p);
+    }
+
+#ifdef UNICODE_ESCAPE_DEBUG
+  fprintf (stderr, "Parsed \\%c+%x value\n", esc_char, uni_value);
+#endif
+
+  if (uc)
+    *uc = uni_value;
+
+  return p;
+}
+
+
+/* Dispatch on the escape letter at *BUF: 'N' is handled by
+   parse_named_character, 'u' and 'U' by parse_unicode_codepoint.
+   UC, BUF and BUFLEN are passed through unchanged and the callee's
+   result is returned.  BUFLEN must be positive and *BUF must be one
+   of those three letters; both conditions are asserted.  */
+const char*
+parse_unicode_escape (unsigned int /*OUTPUT*/ *uc,
+                      const char *buf, ptrdiff_t buflen)
+{
+  assert (buflen > 0);
+
+  switch (*buf)
+    {
+    case 'N':
+      return parse_named_character (uc, buf, buflen);
+
+    default:
+      assert (*buf == 'u' || *buf == 'U');
+      return parse_unicode_codepoint (uc, buf, buflen);
+    }
+}
+
+
+/* Store the UTF-8 encoding of the code point UC at BUF.
+
+   BUFLEN is the number of bytes available at BUF; it is asserted to
+   be at least 4.
+
+   Return the position just past the bytes written.  If u8_uctomb
+   does not produce at least one byte, call unicode_escape_error
+   and, should that return, return NULL.
+
+   UC is declared 'unsigned int' so that this definition matches the
+   prototype in unicode-escape.h; it is converted to ucs4_t when
+   passed to u8_uctomb.  */
+char*
+store_unicode (unsigned int uc, char *buf, ptrdiff_t buflen)
+{
+  assert (buflen >= 4); /* FIXME: allow smaller buffers? */
+
+  /* FIXME: convert ucs4_t to current locale,
+     instead of utf-8, possibly using gnulib's
+     unicodeio.c:unicode_to_mb() */
+  int i = u8_uctomb ((uint8_t *) buf, uc, buflen);
+
+  if (i < 1)
+    {
+      unicode_escape_error (_("invalid/forbidden unicode value"));
+      return NULL;
+    }
+
+#ifdef UNICODE_ESCAPE_DEBUG
+  fprintf (stderr, "store_unicode(uc = 0x%x) returned:\n", uc);
+  for (int j = 0; j < i; ++j)
+    fprintf (stderr, " buf[%d]=0x%02x \n", j, to_uchar (buf[j]));
+#endif
+
+  return buf + (ptrdiff_t) i; /* TODO: validate pointer arithmetic */
+}
diff --git a/src/unicode-escape.h b/src/unicode-escape.h
new file mode 100644
index 0000000..e089629
--- /dev/null
+++ b/src/unicode-escape.h
@@ -0,0 +1,121 @@
+/* Functions for parsing unicode escapes (\u, \U, \N).
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+
+/*
+Contrived Usage Example:
+
+ // This function will be called on errors
+ void unicode_escape_error ( const char * err )
+ {
+ fprintf (stderr, "unicode conversion failed: %s\n", err);
+ exit (1);
+ }
+
+ const char *p = "hello\\N{U+03a8}world";
+ char *ep = NULL;
+ unsigned int uc = 0;
+
+ p = strchr (p, '\\')+1;
+
+ //
+ // Step 1: Parse the string, extract the unicode code point
+ //
+ p = parse_unicode_escape (&uc, p, strlen(p));
+ // uc == 0x03a8
+
+ //
+ // Step 2: convert unicode codepoint to multibyte string
+ //
+ char *outbuf = malloc (100);
+ char *op = outbuf;
+ op = store_unicode (uc, op, 100);
+
+
+
+All three parsing functions have the same interface:
+ parse_unicode_escape: Parse \N, \u, \U
+ parse_named_character: Parse only \N
+ parse_unicode_codepoint: Parse only \u, \U
+
+*/
+
+
+/* Error callback, to be defined by the program that uses this
+   module.  The functions declared in this header report a failure
+   by calling it with a human-readable message MSG.  It may
+   terminate the program; if it returns, the function that reported
+   the failure returns NULL to its caller.
+ */
+extern void
+unicode_escape_error (const char* msg);
+
+/* Parse a \N{...} string.
+
+   'buf' should point to the 'N' character (past the backslash).
+   It does not need to be NUL terminated.
+
+   'buflen' is the maximum number of octets to read.
+
+   On success 'uc' (if not NULL) receives the Unicode code point.
+
+   Returns the position after the parsed string,
+   or NULL if a parsing error occurred.
+*/
+const char*
+parse_named_character (unsigned int /*OUTPUT*/ *uc,
+ const char* buf, ptrdiff_t buflen);
+
+
+
+/* Parse \uHHHH and \UHHHHHHHH strings.
+
+   'buf' should point to the 'u' (or 'U') character (past the
+   backslash).  It does not need to be NUL terminated.
+
+   'buflen' is the maximum number of octets to read.
+
+   On success 'result_uc' (if not NULL) receives the Unicode code
+   point.
+
+   Returns the position after the parsed string,
+   or NULL if a parsing error occurred.
+*/
+const char*
+parse_unicode_codepoint(unsigned int /*OUTPUT*/ *result_uc,
+ const char *buf, ptrdiff_t buflen);
+
+
+/*
+   Parses \N, \u and \U strings.
+
+   'buf' should point to the N/u/U character (past the backslash).
+   It does not need to be NUL terminated.
+
+   'buflen' is the maximum number of octets to read.
+
+   On success 'result_uc' (if not NULL) receives the Unicode code
+   point.
+
+   Returns the position after the parsed string,
+   or NULL if a parsing error occurred.
+
+   Calls either parse_named_character() or parse_unicode_codepoint()
+   depending on the first character in 'buf'.
+*/
+const char*
+parse_unicode_escape (unsigned int /*OUTPUT*/ *result_uc,
+ const char *buf, ptrdiff_t buflen);
+
+
+/* Store the Unicode code point 'uc' in 'buf' as a multibyte
+   sequence (the current implementation always writes UTF-8).
+
+   'buflen' is the number of octets available in 'buf'; the
+   implementation asserts that it is at least 4.
+
+   Returns the position after the stored octets,
+   or NULL if the code point could not be stored.
+*/
+char*
+store_unicode (unsigned int uc,
+ char *buf, ptrdiff_t buflen);