summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2017-01-12 06:28:24 (GMT)
committerPaul Eggert <eggert@cs.ucla.edu>2017-01-12 06:29:19 (GMT)
commit3438c3a65c655baed1bb764e41d7ddcced5f1e7c (patch)
treedabbf3a7e928ed0c129965cf94bdf5ba4fbda3c4
parent3cd2e8625ae7c0608b91af5d2fff9c1c41eedbea (diff)
downloadgrep-3438c3a65c655baed1bb764e41d7ddcced5f1e7c.zip
grep-3438c3a65c655baed1bb764e41d7ddcced5f1e7c.tar.gz
grep-3438c3a65c655baed1bb764e41d7ddcced5f1e7c.tar.bz2
grep: improve comments, mostly in kwset
Remove kwset.h comments that are obsolete and seemingly not maintained anyway; people can look in kwset.c instead. Update comments to reflect current behavior better. Cite Faro & Lecroq 2013. Use GNU style for end-of-sentence.
-rw-r--r--src/kwset.c167
-rw-r--r--src/kwset.h27
-rw-r--r--src/searchutils.c2
3 files changed, 97 insertions, 99 deletions
diff --git a/src/kwset.c b/src/kwset.c
index 20a1284..0c22041 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -17,19 +17,29 @@
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
02110-1301, USA. */
-/* Written August 1989 by Mike Haertel.
- The author may be reached (Email) at the address mike@ai.mit.edu,
- or (US mail) as Mike Haertel c/o Free Software Foundation. */
+/* Written August 1989 by Mike Haertel. */
-/* The algorithm implemented by these routines bears a startling resemblance
- to one discovered by Beate Commentz-Walter, although it is not identical.
+/* The algorithm called "Commentz_Walter" below bears a startling resemblance
+ to that of Beate Commentz-Walter, although it is not identical.
See: Commentz-Walter B. A string matching algorithm fast on the average.
Lecture Notes in Computer Science 71 (1979), 118-32
<http://dx.doi.org/10.1007/3-540-09510-1_10>.
- See also: Aho AV, Corasick MJ. Efficient string matching: an aid to
+
+ For the Aho-Corasick algorithm, see:
+ Aho AV, Corasick MJ. Efficient string matching: an aid to
bibliographic search. CACM 18, 6 (1975), 333-40
<http://dx.doi.org/10.1145/360825.360855>, which describes the
- failure function used below. */
+ failure function used below.
+
+ For the Boyer-Moore algorithm, see: Boyer RS, Moore JS.
+ A fast string searching algorithm. CACM 20, 10 (1977), 762-72
+ <http://dx.doi.org/10.1145/359842.359859>.
+
+ For a survey of more-recent string matching algorithms that might
+ help improve performance, see: Faro S, Lecroq T. The exact online
+ string matching problem: a review of the most recent results.
+ ACM Computing Surveys 45, 2 (2013), 13
+ <http://dx.doi.org/10.1145/2431211.2431212>. */
#include <config.h>
@@ -52,42 +62,47 @@ U (char ch)
return to_uchar (ch);
}
-/* Balanced tree of edges and labels leaving a given trie node. */
+/* Balanced tree of edges and labels leaving a given trie node. */
struct tree
{
- struct tree *llink; /* Left link; MUST be first field. */
- struct tree *rlink; /* Right link (to larger labels). */
- struct trie *trie; /* Trie node pointed to by this edge. */
- unsigned char label; /* Label on this edge. */
- char balance; /* Difference in depths of subtrees. */
+ struct tree *llink; /* Left link; MUST be first field. */
+ struct tree *rlink; /* Right link (to larger labels). */
+ struct trie *trie; /* Trie node pointed to by this edge. */
+ unsigned char label; /* Label on this edge. */
+ char balance; /* Difference in depths of subtrees. */
};
-/* Node of a trie representing a set of keywords. */
+/* Node of a trie representing a set of keywords. */
struct trie
{
- size_t accepting; /* Word index of accepted word, or zero. */
- struct tree *links; /* Tree of edges leaving this node. */
- struct trie *parent; /* Parent of this node. */
- struct trie *next; /* List of all trie nodes in level order. */
- struct trie *fail; /* Aho-Corasick failure function. */
- ptrdiff_t depth; /* Depth of this node from the root. */
- ptrdiff_t shift; /* Shift function for search failures. */
- ptrdiff_t maxshift; /* Max shift of self and descendants. */
+ /* If an accepting node, this is either 2*W + 1 where W is the word
+ index, or is SIZE_MAX if Commentz-Walter is in use and FAIL
+ specifies where to look for more info. If not an accepting node,
+ this is zero. */
+ size_t accepting;
+
+ struct tree *links; /* Tree of edges leaving this node. */
+ struct trie *parent; /* Parent of this node. */
+ struct trie *next; /* List of all trie nodes in level order. */
+ struct trie *fail; /* Aho-Corasick failure function. */
+ ptrdiff_t depth; /* Depth of this node from the root. */
+ ptrdiff_t shift; /* Shift function for search failures. */
+ ptrdiff_t maxshift; /* Max shift of self and descendants. */
};
-/* Structure returned opaquely to the caller, containing everything. */
+/* Structure returned opaquely to the caller, containing everything. */
struct kwset
{
- struct obstack obstack; /* Obstack for node allocation. */
- ptrdiff_t words; /* Number of words in the trie. */
- struct trie *trie; /* The trie itself. */
- ptrdiff_t mind; /* Minimum depth of an accepting node. */
- ptrdiff_t maxd; /* Maximum depth of any node. */
- unsigned char delta[NCHAR]; /* Delta table for rapid search. */
- struct trie *next[NCHAR]; /* Table of children of the root. */
- char *target; /* Target string if there's only one. */
- ptrdiff_t *shift; /* Used in Boyer-Moore search for one string. */
- char const *trans; /* Character translation table. */
+ struct obstack obstack; /* Obstack for node allocation. */
+ ptrdiff_t words; /* Number of words in the trie. */
+ struct trie *trie; /* The trie itself. */
+ ptrdiff_t mind; /* Minimum depth of an accepting node. */
+ ptrdiff_t maxd; /* Maximum depth of any node. */
+ unsigned char delta[NCHAR]; /* Delta table for rapid search. */
+ struct trie *next[NCHAR]; /* Table of children of the root. */
+ char *target; /* Target string if there's only one. */
+ ptrdiff_t *shift; /* Used in Boyer-Moore search for one string. */
+ char const *trans; /* Character translation table. */
/* If there's only one string, this is the string's last byte,
translated via TRANS if TRANS is nonnull. */
@@ -107,8 +122,8 @@ struct kwset
all match when case-folding. */
int gc1help;
- /* If true, prefer Aho-Corasick algorithm to Beate Commentz-Walter
- algorithm in multiple words. */
+ /* If true, prefer Aho-Corasick to Commentz-Walter when searching
+ for multiple words. */
bool reverse;
/* kwsexec implementation. */
@@ -126,8 +141,10 @@ static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *, bool);
static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *, bool);
static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *, bool);
-/* Allocate and initialize a keyword set object, returning an opaque
- pointer to it. */
+/* Return a newly allocated keyword set. A nonnull TRANS specifies a
+ table of character translations to be applied to all pattern and
+ search text. If REVERSE, prefer the Aho-Corasick algorithm to the
+ Commentz-Walter algorithm. */
kwset_t
kwsalloc (char const *trans, bool reverse)
{
@@ -154,7 +171,7 @@ kwsalloc (char const *trans, bool reverse)
}
/* This upper bound is valid for CHAR_BIT >= 4 and
- exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
+ exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
/* Add the given string to the contents of the keyword set. */
@@ -168,7 +185,7 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
text += len;
/* Descend the trie (built of keywords) character-by-character,
- installing new nodes when necessary. */
+ installing new nodes when necessary. */
while (len--)
{
unsigned char uc = kwset->reverse ? *--text : *text++;
@@ -176,7 +193,7 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
/* Descend the tree of outgoing links for this trie node,
looking for the current character and keeping track
- of the path followed. */
+ of the path followed. */
struct tree *cur = trie->links;
struct tree *links[DEPTH_SIZE];
enum { L, R } dirs[DEPTH_SIZE];
@@ -195,7 +212,7 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
/* The current character doesn't have an outgoing link at
this trie node, so build a new trie node and install
- a link in the current trie node's tree. */
+ a link in the current trie node's tree. */
if (!cur)
{
cur = obstack_alloc (&kwset->obstack, sizeof *cur);
@@ -212,13 +229,13 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
cur->label = label;
cur->balance = 0;
- /* Install the new tree node in its parent. */
+ /* Install the new tree node in its parent. */
if (dirs[--depth] == L)
links[depth]->llink = cur;
else
links[depth]->rlink = cur;
- /* Back up the tree fixing the balance flags. */
+ /* Back up the tree fixing the balance flags. */
while (depth && !links[depth]->balance)
{
if (dirs[depth] == L)
@@ -228,7 +245,7 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
--depth;
}
- /* Rebalance the tree by pointer rotations if necessary. */
+ /* Rebalance the tree by pointer rotations if necessary. */
if (depth && ((dirs[depth] == L && --links[depth]->balance)
|| (dirs[depth] == R && ++links[depth]->balance)))
{
@@ -290,13 +307,13 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
trie = cur->trie;
}
- /* Mark the node we finally reached as accepting, encoding the
- index number of this word in the keyword set so far. */
+ /* Mark the node finally reached as accepting, encoding the
+ index number of this word in the keyword set so far. */
if (!trie->accepting)
trie->accepting = 1 + 2 * kwset->words;
++kwset->words;
- /* Keep track of the longest and shortest string of the keyword set. */
+ /* Keep track of the longest and shortest string of the keyword set. */
if (trie->depth < kwset->mind)
kwset->mind = trie->depth;
if (trie->depth > kwset->maxd)
@@ -304,7 +321,7 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
}
/* Enqueue the trie nodes referenced from the given tree in the
- given queue. */
+ given queue. */
static void
enqueue (struct tree *tree, struct trie **last)
{
@@ -317,7 +334,7 @@ enqueue (struct tree *tree, struct trie **last)
/* Compute the Aho-Corasick failure function for the trie nodes referenced
from the given tree, given the failure function for their parent as
- well as a last resort failure node. */
+ well as a last resort failure node. */
static void
treefails (struct tree const *tree, struct trie const *fail,
struct trie *recourse, bool reverse)
@@ -331,7 +348,7 @@ treefails (struct tree const *tree, struct trie const *fail,
treefails (tree->rlink, fail, recourse, reverse);
/* Find, in the chain of fails going back to the root, the first
- node that has a descendant on the current label. */
+ node that has a descendant on the current label. */
while (fail)
{
cur = fail->links;
@@ -354,7 +371,7 @@ treefails (struct tree const *tree, struct trie const *fail,
}
/* Set delta entries for the links of the given tree such that
- the preexisting delta value is larger than the current depth. */
+ the preexisting delta value is larger than the current depth. */
static void
treedelta (struct tree const *tree, ptrdiff_t depth, unsigned char delta[])
{
@@ -366,7 +383,7 @@ treedelta (struct tree const *tree, ptrdiff_t depth, unsigned char delta[])
delta[tree->label] = depth;
}
-/* Return true if A has every label in B. */
+/* Return true if A has every label in B. */
static bool _GL_ATTRIBUTE_PURE
hasevery (struct tree const *a, struct tree const *b)
{
@@ -385,7 +402,7 @@ hasevery (struct tree const *a, struct tree const *b)
}
/* Compute a vector, indexed by character code, of the trie nodes
- referenced from the given tree. */
+ referenced from the given tree. */
static void
treenext (struct tree const *tree, struct trie *next[])
{
@@ -396,8 +413,7 @@ treenext (struct tree const *tree, struct trie *next[])
next[tree->label] = tree->trie;
}
-/* Compute the shift for each trie node, as well as the delta
- table and next cache for the given keyword set. */
+/* Prepare a built keyword set for use. */
void
kwsprep (kwset_t kwset)
{
@@ -436,7 +452,7 @@ kwsprep (kwset_t kwset)
/* Initial values for the delta table; will be changed later. The
delta entry for a given character is the smallest depth of any
- node at which an outgoing edge is labeled by that character. */
+ node at which an outgoing edge is labeled by that character. */
memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
/* Traverse the nodes of the trie in level order, simultaneously
@@ -501,11 +517,11 @@ kwsprep (kwset_t kwset)
for (i = 0; i < NCHAR; ++i)
kwset->next[i] = next[U(trans[i])];
- /* Check if we can use the simple boyer-moore algorithm, instead
- of the hairy commentz-walter algorithm. */
+ /* Decide whether to use the simple Boyer-Moore algorithm, instead
+ of the hairy Commentz-Walter algorithm. */
if (kwset->words == 1)
{
- /* Looking for just one string. Extract it from the trie. */
+ /* Looking for just one string. Extract it from the trie. */
kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
{
@@ -513,7 +529,7 @@ kwsprep (kwset_t kwset)
curr = curr->next;
}
- /* Looking for the delta2 shift that we might make after a
+ /* Looking for the delta2 shift that might be made after a
backwards match has failed. Extract it from the trie. */
if (kwset->mind > 1)
{
@@ -549,7 +565,7 @@ kwsprep (kwset_t kwset)
kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
}
- /* Fix things up for any translation table. */
+ /* Fix things up for any translation table. */
if (trans)
for (i = 0; i < NCHAR; ++i)
kwset->delta[i] = delta[U(trans[i])];
@@ -682,10 +698,10 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size)
char gc1 = kwset->gc1;
char gc2 = kwset->gc2;
- /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
+ /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
ptrdiff_t len12;
if (!INT_MULTIPLY_WRAPV (len, 12, &len12) && len12 < size)
- /* 11 is not a bug, the initial offset happens only once. */
+ /* 11 is not a bug, the initial offset happens only once. */
for (ep = text + size - 11 * len; tp <= ep; )
{
char const *tp0 = tp;
@@ -725,8 +741,8 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size)
return tp - text;
}
- /* Now we have only a few characters left to search. We
- carefully avoid ever producing an out-of-bounds pointer. */
+ /* Now only a few characters are left to search. Carefully avoid
+ ever producing an out-of-bounds pointer. */
ep = text + size;
d = d1[U(tp[-1])];
while (d <= ep - tp)
@@ -746,8 +762,8 @@ static size_t
bmexec (kwset_t kwset, char const *text, size_t size,
struct kwsmatch *kwsmatch, bool longest)
{
- /* Help the compiler inline bmexec_trans in two ways, depending on
- whether kwset->trans is null. */
+ /* Help the compiler inline in two ways, depending on whether
+ kwset->trans is null. */
size_t ret = (kwset->trans
? bmexec_trans (kwset, text, size)
: bmexec_trans (kwset, text, size));
@@ -762,7 +778,7 @@ bmexec (kwset_t kwset, char const *text, size_t size,
return ret;
}
-/* Hairy multiple string search with Commentz-Walter algorithm. */
+/* Hairy multiple string search with the Commentz-Walter algorithm. */
static size_t
cwexec (kwset_t kwset, char const *text, size_t len,
struct kwsmatch *kwsmatch, bool longest)
@@ -857,7 +873,7 @@ cwexec (kwset_t kwset, char const *text, size_t len,
match:
/* Given a known match, find the longest possible match anchored
at or before its starting point. This is nearly a verbatim
- copy of the preceding main search loops. */
+ copy of the preceding main search loops. */
if (longest)
{
if (lim - mch > kwset->maxd)
@@ -920,7 +936,7 @@ cwexec (kwset_t kwset, char const *text, size_t len,
return mch - text;
}
-/* Hairy multiple string search with Aho-Corasick algorithm.
+/* Hairy multiple string search with the Aho-Corasick algorithm.
(inlinable version) */
static inline size_t
acexec_trans (kwset_t kwset, char const *text, size_t len,
@@ -1034,18 +1050,19 @@ static size_t
acexec (kwset_t kwset, char const *text, size_t size,
struct kwsmatch *kwsmatch, bool longest)
{
- /* Help the compiler inline bmexec_trans in two ways, depending on
- whether kwset->trans is null. */
+ /* Help the compiler inline in two ways, depending on whether
+ kwset->trans is null. */
return (kwset->trans
? acexec_trans (kwset, text, size, kwsmatch, longest)
: acexec_trans (kwset, text, size, kwsmatch, longest));
}
-/* Search TEXT for a match of any member of KWSET.
+/* Find the first instance of a KWSET member in TEXT, which has SIZE bytes.
Return the offset (into TEXT) of the first byte of the matching substring,
or SIZE_MAX if no match is found. Upon a match, store details in
*KWSMATCH: index of matched keyword, start offset (same as the return
- value), and length. */
+ value), and length. If LONGEST, find the longest match; otherwise
+ any match will do. */
size_t
kwsexec (kwset_t kwset, char const *text, size_t size,
struct kwsmatch *kwsmatch, bool longest)
@@ -1053,7 +1070,7 @@ kwsexec (kwset_t kwset, char const *text, size_t size,
return kwset->kwsexec (kwset, text, size, kwsmatch, longest);
}
-/* Free the components of the given keyword set. */
+/* Free the components of the given keyword set. */
void
kwsfree (kwset_t kwset)
{
diff --git a/src/kwset.h b/src/kwset.h
index b03b71a..e9fa3dc 100644
--- a/src/kwset.h
+++ b/src/kwset.h
@@ -17,18 +17,16 @@
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
02110-1301, USA. */
-/* Written August 1989 by Mike Haertel.
- The author may be reached (Email) at the address mike@ai.mit.edu,
- or (US mail) as Mike Haertel c/o Free Software Foundation. */
+/* Written August 1989 by Mike Haertel. */
#include <stddef.h>
#include <stdbool.h>
struct kwsmatch
{
- size_t index; /* Index number of matching keyword. */
- size_t offset[1]; /* Offset of each submatch. */
- size_t size[1]; /* Length of each submatch. */
+ size_t index; /* Index number of matching keyword. */
+ size_t offset[1]; /* Offset of match. */
+ size_t size[1]; /* Length of match. */
};
#include "arg-nonnull.h"
@@ -36,26 +34,9 @@ struct kwsmatch
struct kwset;
typedef struct kwset *kwset_t;
-/* Return an opaque pointer to a newly allocated keyword set. A nonnull arg
- specifies a table of character translations to be applied to all
- pattern and search text. */
extern kwset_t kwsalloc (char const *, bool);
-
-/* Incrementally extend the keyword set to include the given string.
- Remember an index number for each keyword included in the set. */
extern void kwsincr (kwset_t, char const *, size_t);
-
-/* When the keyword set has been completely built, prepare it for use. */
extern void kwsprep (kwset_t);
-
-/* Search through the given buffer for a member of the keyword set.
- Return a pointer to the leftmost longest match found, or NULL if
- no match is found. If foundlen is non-NULL, store the length of
- the matching substring in the integer it points to. Similarly,
- if foundindex is non-NULL, store the index of the particular
- keyword found therein. */
extern size_t kwsexec (kwset_t, char const *, size_t, struct kwsmatch *, bool)
_GL_ARG_NONNULL ((4));
-
-/* Deallocate the given keyword set and all its associated storage. */
extern void kwsfree (kwset_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index 51a3097..e0a94e2 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -133,7 +133,7 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
return p == cur ? 0 : cur - p0;
}
-/* Examine the start of BUF (of size SIZE) for word constituents.
+/* Examine the start of BUF (which goes to END) for word constituents.
If COUNTALL, examine as many as possible; otherwise, examine at most one.
Return the total number of bytes in the examined characters. */
static size_t