summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA. Gordon <assafgordon@gmail.com>2014-06-21 02:36:39 (GMT)
committerA. Gordon <assafgordon@gmail.com>2014-06-21 02:36:39 (GMT)
commit7e1d4559ac5500073450d9dc69f63722d2175c32 (patch)
treea18f23e07a98ffff423ba61ace0a96428db25d9b
parentfca1bfec94b1ef787b72783fbdf59317f455967e (diff)
downloadagnostic-7e1d4559ac5500073450d9dc69f63722d2175c32.zip
agnostic-7e1d4559ac5500073450d9dc69f63722d2175c32.tar.gz
agnostic-7e1d4559ac5500073450d9dc69f63722d2175c32.tar.bz2
String-Utils: add REGEX conversion from BRE/ERE to Javascript syntax
-rw-r--r--Makefile5
-rw-r--r--src/node_modules/utils/string_utils.js341
-rw-r--r--src/tests/regex_utils_tester.js148
3 files changed, 494 insertions, 0 deletions
diff --git a/Makefile b/Makefile
index ee4610e..e59dde9 100644
--- a/Makefile
+++ b/Makefile
@@ -116,6 +116,7 @@ web: $(SHELL_PARSER) $(AGNOSTIC_BUNDLE_MINIFIED)
.PHONY: check
check: test_object_utils \
test_string_utils \
+ test_regex_utils \
test_strftime \
test_path_utils \
test_shellquote_utils \
@@ -274,6 +275,10 @@ test_object_utils:
test_string_utils:
$(NODEBIN) ./src/tests/string_utils_tester.js
+# Run the BRE/ERE regex convertor tests (src/tests/regex_utils_tester.js) with $(NODEBIN).
+.PHONY: test_regex_utils
+test_regex_utils:
+ $(NODEBIN) ./src/tests/regex_utils_tester.js
+
.PHONY: test_path_utils
test_path_utils:
$(NODEBIN) ./src/tests/path_utils_tester.js
diff --git a/src/node_modules/utils/string_utils.js b/src/node_modules/utils/string_utils.js
index d860054..c74a5f7 100644
--- a/src/node_modules/utils/string_utils.js
+++ b/src/node_modules/utils/string_utils.js
@@ -271,4 +271,345 @@ this.parse_var_assignment = function(text)
};
}
+/* Javascript strangeness:
+ a = /[]/ => is valid (an empty character class, which never matches)
+ doesn't allow character classes (e.g. '[:alnum:]')
+ allows positive/negative lookahead (?=p) (?!p) and non-capturing groups (?:p)
+
+
+ TODO:
+ Prevent (by escaping) non-greedy expressions,
+ which are meaningless in ERE but meaningful in Javascript
+*/
+
+function __convert_regex_POSIX_to_JS(text,type)
+{
+ var chars = text.split('');
+
+ var convert_regex_char;
+ var convert_regex_backlash_char;
+
+ if (type === "BRE") {
+ convert_regex_char = convert_BRE_char ;
+ convert_regex_backlash_char = convert_BRE_backslash_char ;
+ } else if (type === "ERE") {
+ convert_regex_char = function(ch) {return ch;} ;
+ convert_regex_backlash_char = convert_ERE_backslash_char ;
+ } else {
+ debugger;
+ throw new Error("internal error");
+ }
+
+ /* Translate one un-escaped BRE character into Javascript RegExp syntax.
+    '+', '?', '|', '{', '}', '(' and ')' are ordinary characters in BRE but
+    operators in Javascript, so they are returned with a leading backslash.
+    Every other value is returned unchanged. */
+ function convert_BRE_char(ch)
+ {
+   var literal_in_bre = ['+', '?', '|', '{', '}', '(', ')'];
+
+   if (literal_in_bre.indexOf(ch) === -1)
+     return ch;
+
+   return '\\' + ch;
+ }
+
+ /* Translate one backslash-escaped BRE token into Javascript RegExp syntax.
+
+    ch: a string starting with a backslash followed by at least one
+        more character (e.g. "\\(" ).
+    Returns the Javascript-syntax replacement string.
+    Throws Error('internal error') if ch is not such a token. */
+ function convert_BRE_backslash_char(ch)
+ {
+   if (ch[0] !== '\\')
+     throw new Error('internal error');
+   if (ch.length<2)
+     throw new Error('internal error');
+
+   var tmp = ch[1];
+
+   // In BRE these are operators only when backslash-escaped:
+   //   \( \) \{ \}      (POSIX)
+   //   \? \+ \|         (GNU extensions)
+   // In Javascript's RegExp the meaning is reversed (the bare character is
+   // the operator), so drop the backslash.
+   if ('(){}?+|'.indexOf(tmp) !== -1)
+     return tmp;
+
+   // A backslash followed by a letter (\d, \w, \s, ...) is special in
+   // Javascript. Escape the backslash itself so the letter is not
+   // interpreted as a Javascript escape; the emitted pattern matches a
+   // literal backslash followed by the letter.
+   // NOTE(review): GNU grep reportedly treats '\d' as plain 'd' in BRE
+   // (echo d | grep '\d' prints 'd'); confirm which behaviour is wanted.
+   if ( (tmp >= 'a' && tmp <= 'z') || ( tmp>='A' && tmp <='Z'))
+     return '\\\\' + tmp;
+
+   // Otherwise, return as is.
+   // (e.g. for "\1", but also other undefined tokens)
+   return ch;
+ }
+
+ /* Translate one backslash-escaped ERE token into Javascript RegExp syntax.
+
+    ch: a string starting with a backslash followed by at least one
+        more character.
+    Throws Error('internal error') otherwise.
+
+    A backslash followed by an ASCII letter is special in Javascript
+    (\d, \w, \s, ...), so the backslash itself is escaped; the emitted
+    pattern then matches a literal backslash followed by the letter.
+    Reference behaviour noted by the original author:
+      echo d | grep -E '\d' => prints 'd'
+      echo 1 | grep -E '\d' => prints nothing
+      echo d | grep -P '\d' => prints nothing
+      echo 1 | grep -P '\d' => prints '1'
+    Any other token (e.g. "\1", "\(") is returned unchanged. */
+ function convert_ERE_backslash_char(ch)
+ {
+   if (ch[0] !== '\\' || ch.length<2)
+     throw new Error('internal error');
+
+   var escaped = ch.charAt(1);
+   var is_ascii_letter = /[A-Za-z]/.test(escaped);
+
+   return is_ascii_letter ? '\\\\' + escaped : ch;
+ }
+
+ /* Convert an equivalence class [=XXX=] (Section 9.3.5 Item #5).
+
+    name: the text between "[=" and "=]".
+    Returns a Javascript escape sequence for a single-character name.
+    Throws Error for any longer (or empty) name: the locale being
+    emulated has no equivalence classes.
+
+    The character is emitted as an explicit escape rather than as-is to
+    avoid tricky situations such as:
+      '[a1[=]=]]'
+    which is a valid regex (allow 'a' or '1' or ']'), but converting it to:
+      '[a1]]'
+    is not the same meaning. So we convert it to:
+      '[a1\x5d]' */
+ function convert_bracket_equiv_class(name)
+ {
+   if (name.length===1) {
+     var code = name.charCodeAt(0);
+     var hex = code.toString(16);
+
+     // "\x" takes exactly two hex digits and "\u" exactly four, so
+     // zero-pad; an unpadded "\x9" is not a hex escape in Javascript.
+     if (code > 0xff)
+       return '\\u' + ('0000' + hex).slice(-4);
+     return '\\x' + ('00' + hex).slice(-2);
+   }
+
+   //The locale we emulate doesn't support any equivalent classes
+   throw new Error("Invalid Equivalence class: [=" + name + "=]");
+ }
+
+ /* Convert a collation symbol [.XX.] (Section 9.3.5 Item #6).
+
+    symb: the text between "[." and ".]".
+    Returns a Javascript escape sequence for a single-character symbol.
+    Throws Error for any longer (or empty) symbol: the locale being
+    emulated has no multi-character collation symbols.
+
+    The character is emitted as an explicit escape rather than as-is to
+    avoid tricky situations such as:
+      '[a1[.].]]'
+    which is a valid regex (allow 'a' or '1' or ']'), but converting it to:
+      '[a1]]'
+    is not the same meaning. So we convert it to:
+      '[a1\x5d]' */
+ function convert_bracket_collation_symbol(symb)
+ {
+   if (symb.length===1) {
+     var code = symb.charCodeAt(0);
+     var hex = code.toString(16);
+
+     // "\x" takes exactly two hex digits and "\u" exactly four, so
+     // zero-pad; an unpadded "\x9" is not a hex escape in Javascript.
+     if (code > 0xff)
+       return '\\u' + ('0000' + hex).slice(-4);
+     return '\\x' + ('00' + hex).slice(-2);
+   }
+
+   //The locale we emulate doesn't support any collation symbols.
+   throw new Error("Invalid collation symbol: [." + symb + ".]");
+ }
+
+ /* Convert a character class expression [:alnum:] (Section 9.3.5 Item #7)
+    into the equivalent content of a Javascript character class,
+    based on a simplified "C" locale.
+
+    name: the text between "[:" and ":]" (e.g. 'alnum').
+    Returns text meant to be placed between '[' and ']' of a Javascript
+    RegExp; characters that are special there ('\', ']', '-') are escaped.
+    Throws Error for an unknown class name. */
+ function convert_bracket_char_class(name)
+ {
+   switch (name)
+   {
+   case 'alnum':
+     return 'A-Za-z0-9';
+   case 'alpha':
+     return 'A-Za-z';
+   case 'blank':
+     return ' \t';
+   case 'cntrl':
+   case 'ctrl': // non-POSIX spelling, kept for backward compatibility
+     // Control characters are 0x00-0x1f and 0x7f; space (0x20) is not one.
+     return '\\x00-\\x1f\\x7f';
+   case 'digit':
+     return '0-9';
+   case 'graph':
+     return '\x21-\x7e';
+   case 'lower':
+     return 'a-z';
+   case 'print':
+     return '\x20-\x7e';
+   case 'punct':
+     // '-', '\' and ']' are escaped so that each one stays a literal
+     // member of the Javascript character class.
+     return '!"#$%&\'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~';
+   case 'space':
+     return ' \v\f\t\n\r';
+   case 'upper':
+     return 'A-Z';
+   case 'xdigit':
+     return 'A-Fa-f0-9';
+
+   default:
+     throw new Error("Invalid character class: [:" + name + ":]");
+   }
+ }
+
+
+ /* Consume a backslash and the character following it from 'chars'
+    and return the two joined together (e.g. "\(").
+    Throws 'internal error' if the next character is not a backslash,
+    and 'trailing backslash' if nothing follows the backslash. */
+ function scan_backslash()
+ {
+   if (chars.shift() !== '\\')
+     throw new Error('internal error');
+
+   //TODO: return error?
+   if (chars.length===0)
+     throw new Error('trailing backslash');
+
+   return '\\' + chars.shift();
+ }
+
+ /* Consume one bracketed sub-expression found inside a bracket
+    expression, and return its Javascript conversion.
+    Example input:
+      [ab[.c.]de]      (the "[.c.]" part)
+    The three forms (see section "9.3.5 RE Bracket Expression") are:
+      [.X.] => Collation Symbol
+      [=X=] => Equivalence class
+      [:X:] => character class
+    Throws if 'chars' does not start with one of these openers, or if
+    the matching closer is never found. */
+ function scan_special_char_class()
+ {
+   if (chars.length<2)
+     throw new Error('Internal error');
+   if (chars.shift() !== '[')
+     throw new Error('internal error');
+
+   var delim = chars.shift();
+   if (delim !== ':' && delim !== '=' && delim !== '.')
+     throw new Error('internal error');
+
+   // Collect everything up to the closing delimiter + ']' pair.
+   var name = "";
+   var closed = false;
+   while (!closed && chars.length>=2) {
+     if (chars[0] === delim && chars[1] === ']') {
+       chars.splice(0, 2);
+       closed = true;
+     } else {
+       name += chars.shift();
+     }
+   }
+
+   if (!closed)
+     throw new Error('Unmatched character classs [' + delim);
+
+   switch (delim)
+   {
+   case ':':
+     return convert_bracket_char_class(name);
+   case '.':
+     return convert_bracket_collation_symbol(name);
+   default:
+     return convert_bracket_equiv_class(name);
+   }
+ }
+
+ /* Scan the characters inside a bracket expression, stopping in front
+    of the closing right-bracket (which is left in 'chars'), and return
+    them converted for use inside a Javascript character class.
+    In a bracket expression, all characters lose their special meaning,
+    except right-bracket (in non-first position),
+    and character class expressions ( [:alnum:] ). */
+ function scan_bracket_content_expression()
+ {
+   var result = "";
+   var peek_next_token;
+   while (chars.length > 0) {
+     // Check for special bracket expressions
+     if ( chars[0]==='[' && chars.length>=2 &&
+          ( chars[1]==='=' || chars[1] === ':' || chars[1] ==='.' ) ) {
+       result += scan_special_char_class() ;
+       continue;
+     }
+
+     // otherwise, scan any character EXCEPT closing right-bracket
+     // (all other characters lose their special meaning in a bracket expression)
+     peek_next_token = chars[0];
+     // Inside a bracket expression, a right-bracket ends the expression
+     if (peek_next_token === ']')
+       break;
+
+     chars.shift(); //consume token
+
+     // A backslash is an ordinary character inside a POSIX bracket
+     // expression, but inside a Javascript character class it starts an
+     // escape (e.g. "[\d]" or "[a\]"). Escape it to keep it literal.
+     if (peek_next_token === '\\')
+       result += '\\\\';
+     else
+       result += peek_next_token;
+   }
+   return result;
+ }
+
+ /* Scans a bracket expression, starting with '[' ending with ']',
+    and returns the equivalent Javascript character class.
+    Throws 'internal error' if 'chars' does not start with '[',
+    and "unmatched [ or [^" if the closing ']' is missing. */
+ function scan_bracket_expression()
+ {
+   var ch = chars.shift();
+   var invert = "" ;
+   var right_bracket = "";
+   if (ch !== '[')
+     throw new Error('internal error');
+
+   //Negation, allowed as optional first character in bracket expression
+   if (chars.length>0 && chars[0] === '^')
+     invert = chars.shift();
+
+   //Right-closing bracket loses its meaning if its the FIRST character
+   //in a bracket-expression (or after optional ^)
+   // (Section 9.3.5 item #1)
+   //Javascript has no such rule: it reads "[]" as an empty class and
+   //"[^]" as "any character", so the literal ']' must be escaped.
+   if (chars.length>0 && chars[0] === ']') {
+     chars.shift();
+     right_bracket = '\\]';
+   }
+
+   var bracket_content = scan_bracket_content_expression();
+
+   if (chars.length===0 || chars.shift() !== ']' )
+     throw new Error("unmatched [ or [^");
+
+   return '[' + invert + right_bracket + bracket_content + ']';
+ }
+
+ /* Walk the whole of 'chars' (a regular expression in BRE or ERE
+    syntax) and build the Javascript-compatible RegExp source with the
+    same semantics. Backslash tokens and bracket expressions are handed
+    to their own scanners; every other character goes through the
+    per-syntax character converter. */
+ function scan_regexp()
+ {
+   var converted = "";
+   while (chars.length > 0) {
+     var next = chars[0];
+     if (next === '\\') {
+       converted += convert_regex_backlash_char(scan_backslash());
+     } else if (next === '[') {
+       converted += scan_bracket_expression();
+     } else {
+       chars.shift(); //consume token
+       converted += convert_regex_char(next);
+     }
+   }
+   return converted;
+ }
+
+ return scan_regexp();
+}
+
+/* Given a regex string in BRE (POSIX Basic Regular Expression) syntax, returns a regex string
+ * which can be used with Javascript's RegExp and have the same semantics as the BRE string.
+ * Delegates to __convert_regex_POSIX_to_JS with type "BRE"; any Error it throws
+ * (e.g. for an unmatched bracket or a trailing backslash) propagates to the caller. */
+function regex_BRE_to_JS(text)
+{
+ return __convert_regex_POSIX_to_JS(text,"BRE");
+}
+// Attach the converter to 'this' under the same name.
+this.regex_BRE_to_JS = regex_BRE_to_JS;
+
+/* Given a regex string in ERE (POSIX Extended Regular Expression) syntax, returns a regex string
+ * which can be used with Javascript's RegExp and have the same semantics as the ERE string.
+ * Delegates to __convert_regex_POSIX_to_JS with type "ERE"; any Error it throws
+ * (e.g. for an unmatched bracket or a trailing backslash) propagates to the caller. */
+function regex_ERE_to_JS(text)
+{
+ return __convert_regex_POSIX_to_JS(text,"ERE");
+}
+// Attach the converter to 'this' under the same name.
+this.regex_ERE_to_JS = regex_ERE_to_JS;
+
}
diff --git a/src/tests/regex_utils_tester.js b/src/tests/regex_utils_tester.js
new file mode 100644
index 0000000..8328302
--- /dev/null
+++ b/src/tests/regex_utils_tester.js
@@ -0,0 +1,148 @@
+/****************************************
+ * This file is part of UNIX Guide for the Perplexed project.
+ * Copyright (C) 2014 by Assaf Gordon <assafgordon@gmail.com>
+ * Released under GPLv3 or later.
+ ****************************************/
+
+/* Tests BRE/ERE convertor */
+
+var assert = require('assert');
+var ob_utils = require('utils/object_utils');
+var _ = require('utils/string_utils');
+
+// Conversion table for regex_BRE_to_JS: each entry is
+// [ input in BRE syntax, expected Javascript RegExp source ].
+var BRE_tests = [
+// BRE syntax Javascript-compatible syntax
+[ "(hello)", "\\(hello\\)" ],
+[ "\\(hello\\)", "(hello)" ],
+[ "h*", "h*" ],
+[ "h?", "h\\?" ],
+[ "h\\?", "h?" ],
+[ "h+", "h\\+" ],
+[ "a\\|b", "a|b"],
+[ "a\\{3\\}", "a{3}" ],
+[ "a{3}", "a\\{3\\}" ],
+[ "[a]", "[a]" ],
+
+//NOTE: inside bracket expressions, ) has no special meaning,
+// so no need to escape it.
+[ "[)]", "[)]" ],
+[ "[+]", "[+]" ],
+[ "[}]", "[}]" ],
+
+// Javascript doesn't have character classes, so expand them explicitly
+[ "[[:alnum:]]", "[A-Za-z0-9]" ],
+[ "[[:digit:]]", "[0-9]" ],
+[ "[[:xdigit:]]", "[A-Fa-f0-9]" ],
+
+//Javascript doesn't have collation symbols or equivalence classes,
+//But Agnostic currently only supports "C" locale -
+//so accept single character classes, but reject others
+//NOTE:
+// To avoid messy extreme cases (e.g. "[.].]",
+// which implies a closing right-bracket in the middle of a bracket-expression)
+// those are converted to hex-character.
+[ "[ab[.c.]d]", "[ab\\x63d]" ],
+[ "[ab[=c=]d]", "[ab\\x63d]" ],
+[ "[ab[.].]d]", "[ab\\x5dd]" ],
+];
+
+// Conversion table for regex_ERE_to_JS: each entry is
+// [ input in ERE syntax, expected Javascript RegExp source ].
+var ERE_tests = [
+// ERE syntax Javascript-compatible syntax
+[ "(hello)", "(hello)" ],
+[ "\\(hello\\)", "\\(hello\\)" ],
+[ "h*", "h*" ],
+[ "h?", "h?" ],
+[ "h+", "h+" ],
+[ "a|b", "a|b"],
+[ "a{3}", "a{3}" ],
+[ "h\\*", "h\\*" ],
+[ "h\\?", "h\\?" ],
+[ "h\\+", "h\\+" ],
+[ "a\\|b", "a\\|b"],
+[ "a\\{3\\}", "a\\{3\\}" ],
+
+//In javascript (and perl) regex, \d,\w,\s etc have special meaning.
+//In ERE, they have none. So escape the backslash to make them non-special.
+//(The expected output is a literal backslash followed by the letter.)
+[ "\\d", "\\\\d" ],
+[ "\\s", "\\\\s" ],
+[ "\\w", "\\\\w" ],
+[ "\\D", "\\\\D" ],
+[ "\\S", "\\\\S" ],
+[ "\\W", "\\\\W" ],
+
+//NOTE:
+//not testing bracket expressions again, they are the same in BRE and ERE.
+];
+
+
+// Matching tests: each entry converts a BRE/ERE pattern to Javascript
+// syntax and checks it against an input string.
+// Names must be unique, because a failure is reported by name only.
+var regex_tests = [
+
+//TODO
+//Add many more tests
+
+//name type syntax input expected result
+// if true/false - runs 'test'.
+// if object, runs 'match' and deepEquals the result
+["b1", "BRE", "^a", "aaa", true],
+["b2", "BRE", "^a", "baa", false],
+["b3", "BRE", "a+", "aaa", false],
+["e3", "ERE", "a+", "aaa", true],
+["b4", "BRE", "a|b", "a", false],
+["b5", "BRE", "a|b", "b", false],
+["b6", "BRE", "a|b", "a|b", true],
+["b7", "BRE", "a\\|b", "a", true],
+["b8", "BRE", "a\\|b", "b", true],
+["e7", "ERE", "a|b", "a", true],
+["e8", "ERE", "a|b", "b", true],
+["e9", "ERE", "a\\|b", "a", false],
+["e10", "ERE", "a\\|b", "b", false],
+["e11", "ERE", "a\\|b", "a|b", true],
+
+];
+
+
+// Check every BRE table entry: converting the BRE pattern (column 0)
+// must give the Javascript pattern (column 1).
+for (var bre_idx = 0; bre_idx < BRE_tests.length; bre_idx++) {
+  var bre_case = BRE_tests[bre_idx];
+  assert.equal( _.regex_BRE_to_JS(bre_case[0]), bre_case[1] );
+}
+
+
+// Same check for the ERE table.
+for (var ere_idx = 0; ere_idx < ERE_tests.length; ere_idx++) {
+  var ere_case = ERE_tests[ere_idx];
+  assert.equal( _.regex_ERE_to_JS(ere_case[0]), ere_case[1] );
+}
+
+// Run every matching test: convert the pattern according to its type,
+// compile it with RegExp, and compare against the expected result.
+//   boolean expectation => compared with RegExp.test(input)
+//   anything else       => compared (deepEqual) with the match result,
+//                          given as a plain array of matched strings,
+//                          or null when there is no match
+regex_tests.forEach(function(test){
+  var name = test[0];
+  var type = test[1];
+  var regex = test[2];
+  var input = test[3];
+  var expect = test[4];
+
+  switch (type)
+  {
+  case 'BRE':
+    regex = _.regex_BRE_to_JS(regex);
+    break;
+  case 'ERE':
+    regex = _.regex_ERE_to_JS(regex);
+    break;
+  case 'JS':
+    break;
+  default:
+    throw new Error("unknown regex type '" + type + "'");
+  }
+
+  var re = new RegExp(regex);
+  var failure = "regex test '" + name + "' failed";
+  if (ob_utils.IsBoolean(expect)) {
+    assert.strictEqual( re.test(input), expect, failure );
+  } else {
+    // Copy the match into a plain array so that the extra 'index' and
+    // 'input' properties of a match result do not affect the comparison.
+    var match = input.match(re);
+    var matched = (match === null) ? null : Array.prototype.slice.call(match);
+    assert.deepEqual( matched, expect, failure );
+  }
+});