summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--node.c32
-rw-r--r--support/Makefile.am5
-rw-r--r--support/Makefile.in9
-rw-r--r--support/c-ctype.h37
-rw-r--r--support/u8-uctomb.c71
-rw-r--r--support/unicode-escape.c251
-rw-r--r--support/unicode-escape.h121
-rw-r--r--support/unistr.h11
8 files changed, 536 insertions, 1 deletions
diff --git a/node.c b/node.c
index 97f65fa..11ffed0 100644
--- a/node.c
+++ b/node.c
@@ -27,6 +27,7 @@
#include "awk.h"
#include "math.h"
#include "floatmagic.h" /* definition of isnan */
+#include "unicode-escape.h"
static int is_ieee_magic_val(const char *val);
static NODE *r_make_number(double x);
@@ -372,6 +373,15 @@ cmp_awknums(const NODE *t1, const NODE *t2)
}
+/* unicode_escape_error --- report a failed \u, \U or \N escape.
+
+   Callback declared in unicode-escape.h; the routines in unicode-escape.c
+   call it with a description of the problem when an escape sequence cannot
+   be parsed or stored.
+
+   MSG is passed as the argument of a constant "%s" format rather than as
+   the format itself, so a '%' inside the message is printed literally
+   instead of being interpreted as a conversion specification.  */
+void
+unicode_escape_error (const char* msg)
+{
+  warning ("%s", msg);
+}
+
+
/* make_str_node --- make a string node */
NODE *
@@ -425,6 +435,27 @@ make_str_node(const char *s, size_t len, int flags)
c = *pf++;
if (c == '\\') {
+ if ( *pf == 'u' || *pf == 'U' || *pf == 'N')
+ {
+ /* Special handling for \u and \U
+ which can generate more than one output octet*/
+ unsigned int uc;
+ char* p;
+
+ p = (char*) parse_unicode_escape (&uc, pf,
+ strlen (pf));
+ if (p)
+ {
+ pf = p;
+
+ p = store_unicode(uc, ptm, end-ptm);
+ if (p)
+ ptm = p;
+ }
+ }
+ else
+ {
+ /* All other escape sequences */
c = parse_escape(&pf);
if (c < 0) {
if (do_lint)
@@ -432,6 +463,7 @@ make_str_node(const char *s, size_t len, int flags)
c = '\\';
}
*ptm++ = c;
+ }
} else
*ptm++ = c;
}
diff --git a/support/Makefile.am b/support/Makefile.am
index 0e19876..a5571a8 100644
--- a/support/Makefile.am
+++ b/support/Makefile.am
@@ -39,6 +39,7 @@ EXTRA_DIST = \
# what to make and install
noinst_LIBRARIES = libsupport.a
libsupport_a_SOURCES = \
+ c-ctype.h \
dfa.c \
dfa.h \
getopt.c \
@@ -52,6 +53,10 @@ libsupport_a_SOURCES = \
random.h \
regex.c \
regex.h \
+ unistr.h \
+ u8-uctomb.c \
+ unicode-escape.h \
+ unicode-escape.c \
verify.h \
xalloc.h
diff --git a/support/Makefile.in b/support/Makefile.in
index 13913f6..5627047 100644
--- a/support/Makefile.in
+++ b/support/Makefile.in
@@ -142,7 +142,7 @@ libsupport_a_AR = $(AR) $(ARFLAGS)
libsupport_a_LIBADD =
am_libsupport_a_OBJECTS = dfa.$(OBJEXT) getopt.$(OBJEXT) \
getopt1.$(OBJEXT) localeinfo.$(OBJEXT) random.$(OBJEXT) \
- regex.$(OBJEXT)
+ regex.$(OBJEXT) u8-uctomb.$(OBJEXT) unicode-escape.$(OBJEXT)
libsupport_a_OBJECTS = $(am_libsupport_a_OBJECTS)
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -347,6 +347,7 @@ EXTRA_DIST = \
# what to make and install
noinst_LIBRARIES = libsupport.a
libsupport_a_SOURCES = \
+ c-ctype.h \
dfa.c \
dfa.h \
getopt.c \
@@ -360,6 +361,10 @@ libsupport_a_SOURCES = \
random.h \
regex.c \
regex.h \
+ unistr.h \
+ u8-uctomb.c \
+ unicode-escape.h \
+ unicode-escape.c \
verify.h \
xalloc.h
@@ -420,6 +425,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/localeinfo.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/random.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/u8-uctomb.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unicode-escape.Po@am__quote@
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
diff --git a/support/c-ctype.h b/support/c-ctype.h
new file mode 100644
index 0000000..196736d
--- /dev/null
+++ b/support/c-ctype.h
@@ -0,0 +1,37 @@
+/* Lifted from gnulib's c-ctype.h */
+
+#ifndef C_CTYPE_H
+#define C_CTYPE_H
+
+#include <stdbool.h>
+
+/* Return true if C is one of the ASCII hexadecimal digits
+   (0-9, a-f, A-F).  The test is a switch over character constants,
+   so it does not consult the current locale.
+
+   Declared 'static inline': a plain 'inline' definition in a header
+   supplies no external definition under C99/C11 rules, so any call
+   the compiler decides not to inline would fail to link.  */
+static inline bool
+c_isxdigit (int c)
+{
+  switch (c)
+    {
+    case '0': case '1': case '2': case '3':
+    case '4': case '5': case '6': case '7':
+    case '8': case '9':
+    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Return true if C is ASCII whitespace: space, horizontal tab,
+   newline, vertical tab, form feed or carriage return.  The test is
+   a switch over character constants, so it does not consult the
+   current locale.
+
+   Declared 'static inline': a plain 'inline' definition in a header
+   supplies no external definition under C99/C11 rules, so any call
+   the compiler decides not to inline would fail to link.  */
+static inline bool
+c_isspace (int c)
+{
+  switch (c)
+    {
+    case ' ': case '\t': case '\n': case '\v': case '\f': case '\r':
+      return true;
+    default:
+      return false;
+    }
+}
+
+
+#endif /* C_CTYPE_H */
diff --git a/support/u8-uctomb.c b/support/u8-uctomb.c
new file mode 100644
index 0000000..ac864ea
--- /dev/null
+++ b/support/u8-uctomb.c
@@ -0,0 +1,71 @@
+/* Store a character in UTF-8 string.
+ Copyright (C) 2002, 2005-2006, 2009-2017 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2002.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u8_uctomb as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification. */
+#include "unistr.h"
+
+/* Encode the character UC as UTF-8 into S, which has room for N octets.
+
+   Returns the number of octets written (1 to 4) on success,
+   -1 if UC has no UTF-8 encoding (a surrogate in 0xD800..0xDFFF, or a
+   value of 0x110000 and above), and -2 if UC is encodable but N is too
+   small.  Nothing is written to S unless the call succeeds.  */
+int
+u8_uctomb (uint8_t *s, ucs4_t uc, int n)
+{
+  int len;
+
+  /* Work out how many octets are needed, rejecting unencodable values
+     before the buffer size is looked at.  */
+  if (uc < 0x80)
+    len = 1;
+  else if (uc < 0x800)
+    len = 2;
+  else if (uc < 0x10000)
+    {
+      if (uc >= 0xd800 && uc < 0xe000)
+        return -1;
+      len = 3;
+    }
+  else if (uc < 0x110000)
+    len = 4;
+  else
+    return -1;
+
+  if (n < len)
+    return -2;
+
+  /* The lead octet carries the length marker plus the top bits; each
+     following octet is 10xxxxxx holding the next six bits.  */
+  switch (len)
+    {
+    case 1:
+      s[0] = uc;
+      break;
+    case 2:
+      s[0] = 0xc0 | (uc >> 6);
+      s[1] = 0x80 | (uc & 0x3f);
+      break;
+    case 3:
+      s[0] = 0xe0 | (uc >> 12);
+      s[1] = 0x80 | ((uc >> 6) & 0x3f);
+      s[2] = 0x80 | (uc & 0x3f);
+      break;
+    default: /* 4 */
+      s[0] = 0xf0 | (uc >> 18);
+      s[1] = 0x80 | ((uc >> 12) & 0x3f);
+      s[2] = 0x80 | ((uc >> 6) & 0x3f);
+      s[3] = 0x80 | (uc & 0x3f);
+      break;
+    }
+
+  return len;
+}
diff --git a/support/unicode-escape.c b/support/unicode-escape.c
new file mode 100644
index 0000000..3a885e9
--- /dev/null
+++ b/support/unicode-escape.c
@@ -0,0 +1,251 @@
+#include <config.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <c-ctype.h>
+#include <unistr.h>
+#include <assert.h>
+
+#include "unicode-escape.h"
+
+/* #define UNICODE_ESCAPE_DEBUG */
+
+#ifdef UNICODE_ESCAPE_DEBUG
+#include <stdio.h>
+#endif
+
+/* _() expands to its argument unchanged.  It wraps every message passed
+   to unicode_escape_error() in this file; presumably a placeholder for a
+   gettext-style translation marker -- TODO confirm.  */
+#define _(x) (x)
+
+/* Bound on the length of a Unicode character name. As of
+ Unicode 9.0.0 the maximum is 83, so this should be safe.
+ parse_named_character() sizes its name buffer as this bound plus one
+ octet for the terminating NUL. */
+enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
+
+/* Convert CH to unsigned char, so that an octet with the high bit set
+   does not turn into a negative value where plain char is signed.
+   (Copied from coreutils' system.h.)  */
+static inline unsigned char
+to_uchar (char ch)
+{
+  return (unsigned char) ch;
+}
+
+/* Return the numeric value (0..15) of the hexadecimal digit C.
+   C must already be known to be a hexadecimal digit; any other value
+   yields a meaningless result.
+
+   Adapted from GNU coreutils' "printf.c:print_esc()", where it is a
+   macro.  A function is used here so that an argument with side effects,
+   such as '*p++', is evaluated exactly once.  */
+static inline int
+hextobin (int c)
+{
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+  return c - '0';
+}
+
+
+/* Parse a named character escape sequence \N{...}.
+
+   BUF points at the 'N' (just past the backslash) and BUFLEN is the
+   number of octets that may be read from it; BUF does not need to be
+   NUL-terminated.  Runs of ASCII whitespace between the braces are
+   collapsed into a single space.
+
+   Only the \N{U+XXXX} form is implemented.  XXXX must consist solely of
+   hexadecimal digits (no sign, no "0x" prefix, no trailing text) and
+   denote a value in the range 1..0x10FFFF.
+
+   On success the value is stored in *UC (when UC is not NULL) and a
+   pointer to the octet following the closing '}' is returned.  On
+   failure unicode_escape_error() is called with a description of the
+   problem and NULL is returned.  */
+const char*
+parse_named_character (unsigned int /*OUTPUT*/ *uc,
+                       const char* buf, ptrdiff_t buflen)
+{
+  /* Lifted from Emacs' ./src/lread.c:read_escape() */
+  unsigned char c;
+  bool whitespace = false;
+  char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
+  size_t length = 0;
+
+  /* String doesn't start with 'N' - that's a programming error */
+  assert (buflen > 0);
+  assert (*buf == 'N');
+
+  if (buflen < 2)
+    {
+      unicode_escape_error (_("incomplete \\N{} sequence"));
+      return NULL;
+    }
+
+  ++buf;
+  if (*buf++ != '{')
+    {
+      unicode_escape_error (_("missing '{' after \\N"));
+      return NULL;
+    }
+
+  /* Account for the 'N' and the '{' consumed so far.  */
+  buflen -= 2;
+
+  while (true)
+    {
+      /* Running out of input before the closing brace is an error.  */
+      if (--buflen < 0)
+        {
+          unicode_escape_error (_("incomplete \\N{} sequence"));
+          return NULL;
+        }
+
+      c = to_uchar (*buf++);
+      if (!c)
+        {
+          unicode_escape_error (_("incomplete \\N{} sequence"));
+          return NULL;
+        }
+
+      if (c == '}')
+        break;
+
+      /* Character names are plain ASCII.  */
+      if (c >= 0x80)
+        {
+          unicode_escape_error (_("invalid character in \\N escape sequence"));
+          return NULL;
+        }
+
+      /* Treat multiple adjacent whitespace characters as a
+         single space character.  This makes it easier to use
+         character names in e.g. multi-line strings.  */
+      if (c_isspace (c))
+        {
+          if (whitespace)
+            continue;
+          c = ' ';
+          whitespace = true;
+        }
+      else
+        whitespace = false;
+
+      name[length++] = c;
+      /* Keep one octet free for the terminating NUL.  */
+      if (length >= sizeof name)
+        {
+          unicode_escape_error (_("character name too long in \\N sequence"));
+          return NULL;
+        }
+    }
+
+  if (length == 0)
+    {
+      unicode_escape_error (_("Empty character name in \\N sequence"));
+      return NULL;
+    }
+
+  name[length] = '\0';
+
+#ifdef UNICODE_ESCAPE_DEBUG
+  fprintf (stderr,"parsed name character: \\N{%s}\n", name);
+#endif
+
+  /* Convert U+XXXX to a number */
+  if (name[0] == 'U' && name[1] == '+')
+    {
+      const char *digits = name + 2;
+      const char *q = digits;
+      unsigned long l;
+
+      /* Insist on hexadecimal digits only, up to the end of the name.
+         strtoul on its own would also accept a sign or a "0x" prefix
+         and would silently stop at the first non-digit, so that e.g.
+         "U+41 LATIN" would be taken as U+0041.  */
+      while (c_isxdigit (to_uchar (*q)))
+        ++q;
+      if (q == digits || *q != '\0')
+        {
+          unicode_escape_error (_("Invalid value for \\N{U+XXXX} sequence"));
+          return NULL;
+        }
+
+      /* errno catches digit strings too long for unsigned long.  */
+      errno = 0;
+      l = strtoul (digits, NULL, 16);
+#ifdef UNICODE_ESCAPE_DEBUG
+      fprintf (stderr,"parsed U+XXXX value: 0x%0lx\n", l);
+#endif
+      if (errno != 0 || l == 0 || l > 0x10FFFF)
+        {
+          unicode_escape_error (_("Invalid value for \\N{U+XXXX} sequence"));
+          return NULL;
+        }
+
+      if (uc)
+        *uc = l;
+    }
+  else
+    {
+      /* FIXME, use gnulib's 'uniname' module, store value in '*uc' */
+      unicode_escape_error (_("Named conversion not implemented yet. "
+                              "Please use \\N{U+XXXX}"));
+      return NULL;
+    }
+
+  return buf;
+}
+
+
+
+/* Convert Unicode escape sequences \uHHHH \UHHHHHHHH.
+
+   BUF points at the 'u' or 'U' (just past the backslash) and BUFLEN is
+   the number of octets that may be read from it; BUF does not need to be
+   NUL-terminated.  Exactly 4 hexadecimal digits must follow 'u' and
+   exactly 8 must follow 'U'.
+
+   On success the value is stored in *UC (when UC is not NULL) and a
+   pointer to the octet following the last digit is returned.  On failure
+   unicode_escape_error() is called and NULL is returned.  The value is
+   not range-checked here.  */
+
+const char*
+parse_unicode_codepoint (unsigned int *uc,
+                         const char *buf, ptrdiff_t buflen)
+{
+  /* String doesn't start with 'u/U' - that's a programming error */
+  assert (buflen > 0);
+  assert (*buf == 'u' || *buf == 'U');
+
+  /* Copied from GNU coreutils' "printf.c:print_esc()" */
+  const char esc_char = *buf;
+  const int ndigits = (esc_char == 'u' ? 4 : 8);
+  unsigned int uni_value = 0;
+  const char *p = buf + 1;
+
+  /* return the correct error for either 'U' or 'u'.
+     This will help the user know if 4 or 8 hexdigits
+     were expected. */
+  const char *err = (esc_char == 'u')
+    ? _("missing hexdigits in \\uHHHH")
+    : _("missing hexdigits in \\UHHHHHHHH");
+
+  /* The escape letter plus all of its digits must fit in the input.  */
+  if (buflen < ndigits + 1)
+    {
+      unicode_escape_error (err);
+      return NULL;
+    }
+
+  /* The loop either consumes every digit or returns early, so no
+     further length check is needed once it finishes.  */
+  for (int i = 0; i < ndigits; ++i, ++p)
+    {
+      if (! c_isxdigit (to_uchar (*p)))
+        {
+          unicode_escape_error (err);
+          return NULL;
+        }
+
+      uni_value = uni_value * 16 + hextobin (*p);
+    }
+
+#ifdef UNICODE_ESCAPE_DEBUG
+  fprintf(stderr,"Parsed \\%c+%x value\n", esc_char, uni_value);
+#endif
+
+  if (uc)
+    *uc = uni_value;
+
+  return p;
+}
+
+
+/* Parse any of the \N{...}, \uHHHH and \UHHHHHHHH escapes.
+
+   BUF points at the letter following the backslash.  'N' is handed to
+   parse_named_character(), anything else to parse_unicode_codepoint();
+   the arguments are passed through and the callee's result is returned
+   as is.  */
+const char*
+parse_unicode_escape (unsigned int /*OUTPUT*/ *uc,
+                      const char *buf, ptrdiff_t buflen)
+{
+  assert (buflen > 0);
+  const char c = *buf;
+  assert (c == 'N' || c == 'u' || c == 'U');
+
+  return (c == 'N')
+    ? parse_named_character (uc, buf, buflen)
+    : parse_unicode_codepoint (uc, buf, buflen);
+}
+
+
+/* Store the UTF-8 encoding of UC at BUF, which has room for BUFLEN
+   octets.  No terminating NUL is appended.
+
+   Returns a pointer just past the last octet written.  If u8_uctomb()
+   cannot encode UC, or BUF is too small for the encoding,
+   unicode_escape_error() is called and NULL is returned.  */
+char*
+store_unicode (ucs4_t uc, char *buf, ptrdiff_t buflen)
+{
+  /* u8_uctomb takes its length as an 'int' and never writes more than
+     4 octets, so clamp BUFLEN to 0..4 rather than narrowing a possibly
+     huge ptrdiff_t.  Buffers shorter than 4 octets are allowed; u8_uctomb
+     reports when the encoding does not fit.  */
+  int room;
+  if (buflen <= 0)
+    room = 0;
+  else if (buflen > 4)
+    room = 4;
+  else
+    room = (int) buflen;
+
+  /* FIXME: convert ucs4_t to current locale,
+     instead of utf-8, possibly using gnulib's
+     unicodeio.c:unicode_to_mb() */
+  int i = u8_uctomb ((uint8_t*) buf, uc, room);
+
+  if (i < 1)
+    {
+      /* -2 means "does not fit"; any other failure is an unencodable
+         value.  */
+      unicode_escape_error (i == -2
+                            ? _("output buffer too small for unicode value")
+                            : _("invalid/forbidden unicode value"));
+      return NULL;
+    }
+
+#ifdef UNICODE_ESCAPE_DEBUG
+  fprintf(stderr,"store_uncode(uc = 0x%x) returned:\n", uc);
+  for (int j=0; j<i; ++j)
+    fprintf(stderr, " buf[%d]=0x%02x \n", j, to_uchar(buf[j]));
+#endif
+
+  /* 1 <= i <= room <= buflen, so the result stays within BUF or points
+     one past its end.  */
+  return buf + i;
+}
diff --git a/support/unicode-escape.h b/support/unicode-escape.h
new file mode 100644
index 0000000..e089629
--- /dev/null
+++ b/support/unicode-escape.h
@@ -0,0 +1,121 @@
+/* Functions for parsing unicode escapes (\u, \U, \N).
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+
+/*
+Contrived Usage Example:
+
+ // This function will be called on errors
+ void unicode_escape_error ( const char * err )
+ {
+ fprintf (stderr, "unicode conversion failed: %s\n", err);
+ exit (1);
+ }
+
+ char *p = "hello\N{U+03a8}world";
+ char *ep = NULL;
+ unsigned int uc = 0;
+
+ p = strchr (p, '\\')+1;
+
+ //
+ // Step 1: Parse the string, extract the unicode code point
+ //
+ p = parse_unicode_escape (&uc, p, strlen(p));
+ // uc == 0x03a8
+
+ //
+ // Step 2: convert unicode codepoint to multibyte string
+ //
+ char *outbuf = malloc (100);
+ char *op = outbuf;
+ op = store_unicode (uc, op, 100);
+
+
+
+All three parsing functions have the same interface:
+ parse_unicode_escape: Parse \N, \u, \U
+ parse_named_character: Parse only \N
+ parse_unicode_codepoint: Parse only \u, \U
+
+*/
+
+
+/* Error callback.  It is only declared here; the program using these
+   functions has to define it (see the usage example above).  The
+   functions declared below call it with a message describing the
+   problem before they return NULL.
+ */
+extern void
+unicode_escape_error (const char* msg);
+
+/* Parse \N{...} string.
+
+ buf should point to the 'N' character (past the backslash).
+ does not need to be NUL terminated.
+
+ buflen is the maximum number of octets to read.
+
+ 'uc' will contain the Unicode code point.
+
+ Return position after the parsed string,
+ or NULL if a parsing error occurred.
+*/
+const char*
+parse_named_character (unsigned int /*OUTPUT*/ *uc,
+ const char* buf, ptrdiff_t buflen);
+
+
+
+/* Parse \uHHHH and \UHHHHHHHH strings.
+
+ buf should point to the 'u' (or 'U') character (past the backslash).
+ does not need to be NUL terminated.
+
+ buflen is the maximum number of octets to read.
+
+ 'uc' will contain the Unicode code point.
+
+ Return position after the parsed string,
+ or NULL if a parsing error occurred.
+*/
+const char*
+parse_unicode_codepoint(unsigned int /*OUTPUT*/ *result_uc,
+ const char *buf, ptrdiff_t buflen);
+
+
+/*
+ Parses \N,\u,\U strings.
+
+ buf should point to the N/u/U character (past the backslash).
+ does not need to be NUL terminated.
+
+ buflen is the maximum number of octets to read.
+
+ 'uc' will contain the Unicode code point.
+
+ Return position after the parsed string,
+ or NULL if a parsing error occurred.
+
+ Calls either parse_named_character() or parse_unicode_codepoint()
+ depending on the first character in 'buf'.
+*/
+const char*
+parse_unicode_escape (unsigned int /*OUTPUT*/ *result_uc,
+ const char *buf, ptrdiff_t buflen);
+
+
+/* Store the code point 'uc' as a multibyte sequence (currently always
+   UTF-8) at 'buf'.
+
+   buflen is the number of octets available at 'buf'; it should be
+   at least 4, the longest sequence that can be produced.
+
+   Return position after the stored octets,
+   or NULL if the value could not be stored.
+*/
+char*
+store_unicode (unsigned int uc,
+ char *buf, ptrdiff_t buflen);
diff --git a/support/unistr.h b/support/unistr.h
new file mode 100644
index 0000000..47af312
--- /dev/null
+++ b/support/unistr.h
@@ -0,0 +1,11 @@
+/* Declarations needed by u8-uctomb.c and its callers.
+
+   The include guard avoids the '__' prefix, which is reserved for the
+   implementation, and follows the C_CTYPE_H naming used by c-ctype.h.  */
+#ifndef UNISTR_H
+#define UNISTR_H
+
+/* One octet of a UTF-8 string.
+   NOTE(review): this repeats a name that <stdint.h> also defines;
+   confirm that no translation unit including both sees a conflicting
+   definition.  */
+typedef unsigned char uint8_t;
+
+/* A Unicode character value.  */
+typedef unsigned int ucs4_t;
+
+/* Store the UTF-8 encoding of UC in S, which has room for N octets.
+   Returns the number of octets written, -1 if UC cannot be encoded,
+   or -2 if N is too small.  */
+extern int
+  u8_uctomb (uint8_t *s, ucs4_t uc, int n);
+
+
+#endif /* UNISTR_H */