summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2017-01-16 20:13:50 (GMT)
committerPaul Eggert <eggert@cs.ucla.edu>2017-01-17 16:02:05 (GMT)
commitaa1f107a143900d2d9a9d8c3aa541671140315ba (patch)
tree6d96789c20d185fc64dd290390ee79a5487db76e
parente0cb70cc85aac558106d0c66adff239c97aba563 (diff)
downloadgrep-aa1f107a143900d2d9a9d8c3aa541671140315ba.zip
grep-aa1f107a143900d2d9a9d8c3aa541671140315ba.tar.gz
grep-aa1f107a143900d2d9a9d8c3aa541671140315ba.tar.bz2
Improve -i performance in typical UTF-8 searches
Currently ‘grep -i i’ is slow in a UTF-8 locale, because ‘i’ in the pattern matches the two-byte character 'ı' (U+0131, LATIN SMALL LETTER DOTLESS I) in data, and kwset handles only single-byte character translations, so grep falls back on a slower DFA-based search for all searches. Improve -i performance in the typical case by using kwset when data are free of troublesome characters like 'ı', falling back on the DFA only when data contain troublesome characters. * src/dfasearch.c (GEAcompile): * src/grep.c (compile_fp_t): * src/kwsearch.c (Fcompile): * src/pcresearch.c (Pcompile): Pattern arg is now char *, not char const *, since Fcompile now reallocates it sometimes. * src/grep.c (all_single_byte_after_folding): Remove. All callers removed. (fgrep_icase_charlen): New function. (fgrep_icase_available, try_fgrep_pattern): Use it, for more-generous semantics. (fgrep_to_grep_pattern): Now extern. (main): Do not free keys, since Fexecute may use them. * src/kwsearch.c (struct kwsearch): New struct. (Fcompile): Return it. If -i, be more generous about patterns. (Fexecute): Use it. Fall back on DFA when the data contain troublesome characters; this should be rare in practice. * src/kwset.c, src/kwset.h (kwswords): New function.
-rw-r--r--src/dfasearch.c2
-rw-r--r--src/grep.c93
-rw-r--r--src/kwsearch.c92
-rw-r--r--src/kwset.c6
-rw-r--r--src/kwset.h1
-rw-r--r--src/pcresearch.c2
-rw-r--r--src/search.h7
7 files changed, 153 insertions, 50 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 9b32d48..542e7f1 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -104,7 +104,7 @@ kwsmusts (struct dfa_comp *dc)
}
void *
-GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
+GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
{
char *motif;
struct dfa_comp *dc = xcalloc (1, sizeof (*dc));
diff --git a/src/grep.c b/src/grep.c
index 0a674ec..81654c3 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -575,7 +575,7 @@ static bool seek_failed;
static bool seek_data_failed;
/* Functions we'll use to search. */
-typedef void *(*compile_fp_t) (char const *, size_t, reg_syntax_t);
+typedef void *(*compile_fp_t) (char *, size_t, reg_syntax_t);
typedef size_t (*execute_fp_t) (void *, char const *, size_t, size_t *,
char const *);
static execute_fp_t execute;
@@ -2254,54 +2254,58 @@ contains_encoding_error (char const *pat, size_t patlen)
return false;
}
-/* Return true if the set of single-byte characters USED contains only
- characters whose case counterparts are also single-byte. */
+/* Return the number of bytes in the initial character of PAT, of size
+ PATLEN, if Fcompile can handle that character. Return -1 if
+ Fcompile cannot handle it. MBS is the multibyte conversion state.
-static bool
-all_single_byte_after_folding (bool used[UCHAR_MAX + 1])
-{
- for (int c = 0; c <= UCHAR_MAX; c++)
- if (used[c])
- {
- wint_t wc = localeinfo.sbctowc[c];
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- size_t nfolded = case_folded_counterparts (wc, folded);
-
- for (size_t i = 0; i < nfolded; i++)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- if (1 < wcrtomb (s, folded[i], &mb_state))
- return false;
- }
- }
+ Fcompile can handle a character C if C is single-byte, or if C has no
+ case folded counterparts and toupper translates none of its bytes. */
- return true;
+static int
+fgrep_icase_charlen (char const *pat, size_t patlen, mbstate_t *mbs)
+{
+ int n = localeinfo.sbclen[to_uchar (*pat)];
+ if (n < 0)
+ {
+ wchar_t wc;
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ size_t wn = mbrtowc (&wc, pat, patlen, mbs);
+ if (MB_LEN_MAX < wn || case_folded_counterparts (wc, folded))
+ return -1;
+ for (int i = wn; 0 < --i; )
+ {
+ unsigned char c = pat[i];
+ if (toupper (c) != c)
+ return -1;
+ }
+ n = wn;
+ }
+ return n;
}
-/* Return true if the -F patterns PAT, of size PATLEN, match only
- single-byte characters including case folding, and so can be
- processed by the -F matcher even if -i is given. */
+/* Return true if the -F patterns PAT, of size PATLEN, contain only
+ single-byte characters or characters not subject to case folding,
+ and so can be processed by Fcompile. */
static bool
fgrep_icase_available (char const *pat, size_t patlen)
{
- bool used[UCHAR_MAX + 1] = { 0, };
+ mbstate_t mbs = {0,};
- for (size_t i = 0; i < patlen; i++)
+ for (size_t i = 0; i < patlen; )
{
- unsigned char c = pat[i];
- if (localeinfo.sbctowc[c] == WEOF)
+ int n = fgrep_icase_charlen (pat + i, patlen - i, &mbs);
+ if (n < 0)
return false;
- used[c] = true;
+ i += n;
}
- return all_single_byte_after_folding (used);
+ return true;
}
/* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */
-static void
+void
fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
{
size_t len = *len_p;
@@ -2358,8 +2362,6 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p)
char *new_keys = xmalloc (len + 1);
char *p = new_keys;
mbstate_t mb_state = { 0 };
- size_t mblen_lim = match_icase ? 1 : -3;
- bool used[UCHAR_MAX + 1] = { 0, };
while (len != 0)
{
@@ -2396,19 +2398,27 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p)
}
{
- size_t n = mb_clen (keys, len, &mb_state);
- if (mblen_lim < n)
- goto fail;
- used[to_uchar (keys[0])] = true;
+ size_t n;
+ if (match_icase)
+ {
+ int ni = fgrep_icase_charlen (keys, len, &mb_state);
+ if (ni < 0)
+ goto fail;
+ n = ni;
+ }
+ else
+ {
+ n = mb_clen (keys, len, &mb_state);
+ if (MB_LEN_MAX < n)
+ goto fail;
+ }
+
p = mempcpy (p, keys, n);
keys += n;
len -= n;
}
}
- if (mblen_lim == 1 && !all_single_byte_after_folding (used))
- goto fail;
-
if (*len_p != p - new_keys)
{
*len_p = p - new_keys;
@@ -2878,7 +2888,6 @@ main (int argc, char **argv)
execute = matchers[matcher].execute;
compiled_pattern = matchers[matcher].compile (keys, keycc,
matchers[matcher].syntax);
- free (keys);
/* We need one byte prior and one after. */
char eolbytes[3] = { 0, eolbyte, 0 };
size_t match_size;
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5e9df16..30a027c 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,8 +21,33 @@
#include <config.h>
#include "search.h"
+/* A compiled -F pattern list. */
+
+struct kwsearch
+{
+ /* The kwset for this pattern list. */
+ kwset_t kwset;
+
+ /* The number of user-specified patterns. This is less than
+ 'kwswords (kwset)' when some extra one-character words have been
+ appended, one for each troublesome character that will require a
+ DFA search. */
+ ptrdiff_t words;
+
+ /* The user's pattern and its size in bytes. */
+ char *pattern;
+ size_t size;
+
+ /* The user's pattern compiled as a regular expression,
+ or null if it has not been compiled. */
+ void *re;
+};
+
+/* Compile the -F style PATTERN, containing SIZE bytes. Return a
+ description of the compiled pattern. */
+
void *
-Fcompile (char const *pattern, size_t size, reg_syntax_t ignored)
+Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
{
kwset_t kwset;
ptrdiff_t total = size;
@@ -74,11 +99,55 @@ Fcompile (char const *pattern, size_t size, reg_syntax_t ignored)
while (p);
free (buf);
+ ptrdiff_t words = kwswords (kwset);
+
+ if (match_icase)
+ {
+ /* For each pattern character C that has a case folded
+ counterpart F that is multibyte and so cannot easily be
+ implemented via translating a single byte, append a pattern
+ containing just F. That way, if the data contains F, the
+ matcher can fall back on DFA. For example, if C is 'i' and
+ the locale is en_US.utf8, append a pattern containing just
+ the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
+ Fexecute will use a DFA if the data contain U+0131. */
+ mbstate_t mbs = { 0 };
+ char checked[NCHAR] = {0,};
+ for (p = pattern; p < pattern + size; p++)
+ {
+ unsigned char c = *p;
+ if (checked[c])
+ continue;
+ checked[c] = true;
+
+ wint_t wc = localeinfo.sbctowc[c];
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+
+ for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
+ {
+ char s[MB_LEN_MAX];
+ int nbytes = wcrtomb (s, folded[i], &mbs);
+ if (1 < nbytes)
+ kwsincr (kwset, s, nbytes);
+ }
+ }
+ }
+
kwsprep (kwset);
- return kwset;
+ struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
+ kwsearch->kwset = kwset;
+ kwsearch->words = words;
+ kwsearch->pattern = pattern;
+ kwsearch->size = size;
+ kwsearch->re = NULL;
+ return kwsearch;
}
+/* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
+ If found, return the offset of the first match and store its
+ size into *MATCH_SIZE. If not found, return SIZE_MAX.
+ If START_PTR is nonnull, start searching there. */
size_t
Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
char const *start_ptr)
@@ -90,7 +159,8 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
size_t ret_val;
bool mb_check;
bool longest;
- kwset_t kwset = vcp;
+ struct kwsearch *kwsearch = vcp;
+ kwset_t kwset = kwsearch->kwset;
if (match_lines)
mb_check = longest = false;
@@ -108,6 +178,22 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
if (offset < 0)
break;
len = kwsmatch.size[0] - 2 * match_lines;
+
+ if (kwsearch->words <= kwsmatch.index)
+ {
+ /* The data contain a multibyte character that matches
+ some pattern character that is a case folded counterpart.
+ Since the kwset code cannot handle this case, fall back
+ on the DFA code, which can. */
+ if (! kwsearch->re)
+ {
+ fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
+ kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
+ RE_SYNTAX_GREP);
+ }
+ return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
+ }
+
if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
{
/* We have matched a single byte that is not at the beginning of a
diff --git a/src/kwset.c b/src/kwset.c
index 0b6fab4..77ef7c7 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -328,6 +328,12 @@ kwsincr (kwset_t kwset, char const *text, ptrdiff_t len)
kwset->maxd = trie->depth;
}
+ptrdiff_t
+kwswords (kwset_t kwset)
+{
+ return kwset->words;
+}
+
/* Enqueue the trie nodes referenced from the given tree in the
given queue. */
static void
diff --git a/src/kwset.h b/src/kwset.h
index ef78db3..5c78e54 100644
--- a/src/kwset.h
+++ b/src/kwset.h
@@ -36,6 +36,7 @@ typedef struct kwset *kwset_t;
extern kwset_t kwsalloc (char const *, bool);
extern void kwsincr (kwset_t, char const *, ptrdiff_t);
+extern ptrdiff_t kwswords (kwset_t) _GL_ATTRIBUTE_PURE;
extern void kwsprep (kwset_t);
extern ptrdiff_t kwsexec (kwset_t, char const *, ptrdiff_t,
struct kwsmatch *, bool)
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 28144b4..703498c 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -91,7 +91,7 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
#endif
void *
-Pcompile (char const *pattern, size_t size, reg_syntax_t ignored)
+Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
{
#if !HAVE_LIBPCRE
die (EXIT_TROUBLE, 0,
diff --git a/src/search.h b/src/search.h
index c8bd5f2..8533d59 100644
--- a/src/search.h
+++ b/src/search.h
@@ -55,19 +55,20 @@ extern size_t wordchar_prev (char const *, char const *, char const *)
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
-extern void *GEAcompile (char const *, size_t, reg_syntax_t);
+extern void *GEAcompile (char *, size_t, reg_syntax_t);
extern size_t EGexecute (void *, char const *, size_t, size_t *, char const *);
/* kwsearch.c */
-extern void *Fcompile (char const *, size_t, reg_syntax_t);
+extern void *Fcompile (char *, size_t, reg_syntax_t);
extern size_t Fexecute (void *, char const *, size_t, size_t *, char const *);
/* pcresearch.c */
-extern void *Pcompile (char const *, size_t, reg_syntax_t);
+extern void *Pcompile (char *, size_t, reg_syntax_t);
extern size_t Pexecute (void *, char const *, size_t, size_t *, char const *);
/* grep.c */
extern struct localeinfo localeinfo;
+extern void fgrep_to_grep_pattern (char **, size_t *);
/* Return the number of bytes in the character at the start of S, which
is of size N. N must be positive. MBS is the conversion state.