summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA. Gordon <assafgordon@gmail.com>2014-11-17 22:20:58 (GMT)
committerA. Gordon <assafgordon@gmail.com>2014-11-17 22:20:58 (GMT)
commita2a39c54a5cea228ebdf50642b00852ee5d71736 (patch)
tree39c9fdc822805d106b68d1b80091880d6a0dd5d9
parente2ce8a02b809898830bb7978f998f7a4e030c7ae (diff)
downloadagnostic-shell-grammer.zip
agnostic-shell-grammer.tar.gz
agnostic-shell-grammer.tar.bz2
Jison-based parsershell-grammer
-rw-r--r--shell-jison/main.js30
-rw-r--r--shell-jison/shell-lexer.js238
-rw-r--r--shell-jison/shell-parser.jison231
-rw-r--r--shell-jison/shell-parser.orig.y220
4 files changed, 719 insertions, 0 deletions
diff --git a/shell-jison/main.js b/shell-jison/main.js
new file mode 100644
index 0000000..e608a0d
--- /dev/null
+++ b/shell-jison/main.js
@@ -0,0 +1,30 @@
+/*
+ *
+ */
+var fs = require("fs");
+var jison = require("jison");
+var bnf = fs.readFileSync("shell-parser.jison", "utf8");
+var lexer = require("./shell-lexer.js");
+
+jison.print = function() {};
+
+var parser = new jison.Parser(bnf);
+parser.lexer = new lexer();
+
+ var input = "" ;
+ process.argv.forEach(function (val, index, array) {
+ /* The first two parameters are "nodejs" and the script name */
+ if (index<2)
+ return;
+ if (input !== "")
+ input = input + " ";
+ input = input + array[index];
+ });
+
+if (input == "") {
+ console.log('Usage: shell-scanner SHELL-COMMAND');
+ process.exit(1);
+}
+var source = input;
+var result = parser.parse(source);
+console.log(JSON.stringify(result));
diff --git a/shell-jison/shell-lexer.js b/shell-jison/shell-lexer.js
new file mode 100644
index 0000000..7e45fec
--- /dev/null
+++ b/shell-jison/shell-lexer.js
@@ -0,0 +1,238 @@
+"use strict";
+
+// See: http://zaach.github.io/jison/docs/ ( section "custom scanners" )
+
+function AlphabetScanner() {
+ var text = "";
+ this.yytext = "";
+ this.yyloc = {
+ first_column: 1,
+ first_line: 1,
+ last_line: 1,
+ last_column: 1
+ };
+ this.yylloc = this.yyloc;
+ this.setInput = function(text_) {
+ text = text_;
+ };
+
+ this.peekChar = function() {
+ return text.charAt(0);
+ }
+ this.consumeChar = function() {
+ var c = text.charAt(0);
+ text = text.substring(1);
+ if (c==="\n") {
+ this.yyloc.first_column = 1 ;
+ this.yyloc.last_column = 1 ;
+ this.yyloc.first_line++;
+ this.yyloc.last_line++;
+ } else {
+ this.yyloc.first_column++;
+ this.yyloc.last_column++;
+ }
+ }
+ this.log = function() {
+ var args = Array.prototype.slice.call(arguments);
+ //console.log(args);
+ }
+
+
+ /* returns true if character C is a valid first character of a shell
+ operator.
+ These are the valid shell operations:
+ && || ;; << >> <& >& <> <<- >|
+ NOTE:
+ The characters "{ } !" are not operators - they are resevered words.
+ */
+ this.operator_first_char = function(c) {
+ switch (c)
+ {
+ case '&':
+ case '|':
+ case ';':
+ case '<':
+ case '>':
+ return true;
+ }
+ return false;
+ }
+ /* Given an operator string, returns the corresponding
+ parser token, or null.
+ NOTE:
+ The token strings must match the ones defined in
+ 'shell.js' (generated from 'shell.jison').
+ */
+ this.get_operator_token = function(str) {
+ switch(str)
+ {
+ case '&&': return "AND_IF";
+ case '||': return 'OR_IF';
+ case ';;': return 'DSEMI';
+ case '<<': return 'DLESS';
+ case '>>': return 'DGREAT';
+ case '<&': return 'LESSAND';
+ case '>&': return 'GREATAND';
+ case '<>': return 'LESSGREAT';
+ case '<<-': return 'DLESSDASH';
+ case '>|': return 'CLOBBER';
+ case '>': return '>';
+ case '<': return '<';
+ case ';': return ';';
+ case '&': return '&';
+ case '|': return '|';
+ case '(': return '(';
+ case ')': return ')';
+ }
+ return null;
+ }
+ /* Checking condition of Token Recognition (seciotn 2.3),
+ item 2:
+ If the previous character was used as part of an operator and the
+ current character is not quoted and can be used with the current
+ characters to form an operator, it shall be used as part of that
+ (operator) token.
+ */
+ this.part_of_token = function(current_token,peeked_char) {
+ var result = false;
+ if (this.operator_first_char(current_token)) {
+ /* a valid string of an operator? */
+ var test = current_token + peeked_char;
+ var tmp = this.get_operator_token( test );
+ result = ( tmp !== null );
+ }
+ else if ((current_token==="<<") && (peeked_char==="-")) {
+ /* Special case for three-character operator */
+ result = true;
+ }
+ if (0) {
+ this.log("scanner: part_of_token('" + current_token + "','" +
+ peeked_char + "') = " + result);
+ }
+ return result;
+ }
+
+ /* Helper function to log and set yytext */
+ this.return_token = function(token,value) {
+ this.log("scanner: returning token '" + token +
+ "' value='" + value + "'");
+ this.yytext = value;
+ return token;
+ }
+
+ this.lex = function() {
+
+ // Consume a single character and increment our column numbers.
+ this.log("scanner: lex() called");
+
+ this.yytext = "";
+ this.yyloc.first_column++;
+ this.yyloc.last_column++;
+
+ var current_token = null;
+ var current_token_text = "";
+
+ // "[...] process shall continue until an actual token is delimited."
+ while (1) {
+ /* Item 1: end of input */
+ if (text === "") {
+ // Return the EOF token when we run out of text.
+ if (current_token===null) {
+ this.log("scanner: returning EOF");
+ return;
+ }
+ return this.return_token(current_token,current_token_text);
+ }
+
+ var peek = this.peekChar();
+ if (0) {
+ this.log("scanner: peeked character = '" + peek + "' " +
+ "current_token = '" + current_token + "' " +
+ "current_token_text = '" + current_token_text + "'");
+ }
+
+ var b = this.part_of_token(current_token_text,peek);
+ /* Item 2: part of an operator? */
+ if (b) {
+ this.consumeChar();
+ current_token_text += peek;
+ var tmp = this.get_operator_token(current_token_text);
+ if (tmp) { current_token = tmp ; }
+ continue;
+ }
+
+
+ /* Item 3: if previous token is an operator,
+ and new character is not,
+ then return the previous token.
+ NOTE: the new character is not consumed, and will
+ be used in the next iteration. */
+ var c = this.get_operator_token(current_token_text);
+ if (!b && (c!==null)) {
+ return this.return_token(c,current_token_text);
+ }
+
+ /* Item 4: TODO */
+
+ /* Item 5: TODO */
+
+ /* Item 6: new operator? return current token */
+ c = this.get_operator_token(peek);
+ if ( c !== null) {
+ /* If there's a previous token, return it,
+ without consuming the peeked character. it will be
+ handled on the next iteration. */
+ if (current_token !== null) {
+ return this.return_token(current_token,current_token_text);
+ }
+ current_token = c;
+ current_token_text = peek;
+ this.consumeChar();
+ continue;
+ }
+
+ /* Item 7: New line */
+ if (peek=== "\n") {
+ if (current_token !== null) {
+ return this.return_token(current_token,current_token_text);
+ }
+ else {
+ return this.return_token('NEWLINE',"\n");
+ }
+ }
+
+ /* Item 8: blanks */
+ if (peek=== " " || peek==="\t") {
+ this.consumeChar();
+ if (current_token !== null) {
+ return this.return_token(current_token,current_token_text);
+ }
+ continue;
+ }
+
+ /* Item 9: append to word */
+ if (current_token=="WORD") {
+ this.consumeChar();
+ current_token_text += peek;
+ continue;
+ }
+
+ /* Item 10: comments: TODO */
+
+ /* Item 11: new word */
+ if (current_token===null) {
+ this.consumeChar();
+ current_token = "WORD";
+ current_token_text += peek;
+ continue;
+ } else {
+ throw "scanner internal error 54351:" +
+ "current_token = '" + current_token + "' " +
+ "current_token_text = '" + current_token_text + "' " +
+ "peek = '" + peek + "'" ;
+ }
+ }
+ };
+}
+
+module.exports = AlphabetScanner;
diff --git a/shell-jison/shell-parser.jison b/shell-jison/shell-parser.jison
new file mode 100644
index 0000000..1ef7d60
--- /dev/null
+++ b/shell-jison/shell-parser.jison
@@ -0,0 +1,231 @@
+/* The POSIX Shell Grammar, copied from:
+
+The Open Group Base Specifications Issue 6
+IEEE Std 1003.1, 2004 Edition
+Copyright © 2001-2004 The IEEE and The Open Group, All Rights reserved.
+ 2. Shell Command Language
+ 2.3 Token Recognition
+
+http://pubs.opengroup.org/onlinepubs/009604599/utilities/xcu_chap02.html#tag_02_03
+*/
+
+/* -------------------------------------------------------
+ The grammar symbols
+ ------------------------------------------------------- */
+
+
+%token WORD
+%token ASSIGNMENT_WORD
+%token NAME
+%token NEWLINE
+%token IO_NUMBER
+
+
+/* The following are the operators mentioned above. */
+
+
+%token AND_IF OR_IF DSEMI
+/* '&&' '||' ';;' */
+
+
+%token DLESS DGREAT LESSAND GREATAND LESSGREAT DLESSDASH
+/* '<<' '>>' '<&' '>&' '<>' '<<-' */
+
+
+%token CLOBBER
+/* '>|' */
+
+
+/* The following are the reserved words. */
+
+
+%token If Then Else Elif Fi Do Done
+/* 'if' 'then' 'else' 'elif' 'fi' 'do' 'done' */
+
+
+%token Case Esac While Until For
+/* 'case' 'esac' 'while' 'until' 'for' */
+
+
+/* These are reserved words, not operator tokens, and are
+ recognized when reserved words are recognized. */
+
+
+%token Lbrace Rbrace Bang
+/* '{' '}' '!' */
+
+
+%token In
+/* 'in' */
+
+
+/* -------------------------------------------------------
+ The Grammar
+ ------------------------------------------------------- */
+
+
+%start complete_command
+%%
+complete_command : list separator { return $1 ; }
+ | list { return $1 ; }
+ ;
+list : list separator_op and_or
+ | and_or
+ ;
+and_or : pipeline
+ | and_or AND_IF linebreak pipeline
+ | and_or OR_IF linebreak pipeline
+ ;
+pipeline : pipe_sequence
+ | Bang pipe_sequence
+ ;
+pipe_sequence : command
+ | pipe_sequence '|' linebreak command
+ ;
+command : simple_command
+ | compound_command
+ | compound_command redirect_list
+ | function_definition
+ ;
+compound_command : brace_group
+ | subshell
+ | for_clause
+ | case_clause
+ | if_clause
+ | while_clause
+ | until_clause
+ ;
+subshell : '(' compound_list ')'
+ ;
+compound_list : term
+ | newline_list term
+ | term separator
+ | newline_list term separator
+ ;
+term : term separator and_or
+ | and_or
+ ;
+for_clause : For name linebreak do_group
+ | For name linebreak in sequential_sep do_group
+ | For name linebreak in wordlist sequential_sep do_group
+ ;
+name : NAME /* Apply rule 5 */
+ ;
+in : In /* Apply rule 6 */
+ ;
+wordlist : wordlist WORD
+ | WORD
+ ;
+case_clause : Case WORD linebreak in linebreak case_list Esac
+ | Case WORD linebreak in linebreak case_list_ns Esac
+ | Case WORD linebreak in linebreak Esac
+ ;
+case_list_ns : case_list case_item_ns
+ | case_item_ns
+ ;
+case_list : case_list case_item
+ | case_item
+ ;
+case_item_ns : pattern ')' linebreak
+ | pattern ')' compound_list linebreak
+ | '(' pattern ')' linebreak
+ | '(' pattern ')' compound_list linebreak
+ ;
+case_item : pattern ')' linebreak DSEMI linebreak
+ | pattern ')' compound_list DSEMI linebreak
+ | '(' pattern ')' linebreak DSEMI linebreak
+ | '(' pattern ')' compound_list DSEMI linebreak
+ ;
+pattern : WORD /* Apply rule 4 */
+ | pattern '|' WORD /* Do not apply rule 4 */
+ ;
+if_clause : If compound_list Then compound_list else_part Fi
+ | If compound_list Then compound_list Fi
+ ;
+else_part : Elif compound_list Then else_part
+ | Else compound_list
+ ;
+while_clause : While compound_list do_group
+ ;
+until_clause : Until compound_list do_group
+ ;
+function_definition : fname '(' ')' linebreak function_body
+ ;
+function_body : compound_command /* Apply rule 9 */
+ | compound_command redirect_list /* Apply rule 9 */
+ ;
+fname : NAME /* Apply rule 8 */
+ ;
+brace_group : Lbrace compound_list Rbrace
+ ;
+do_group : Do compound_list Done /* Apply rule 6 */
+ ;
+simple_command   : cmd_prefix cmd_word cmd_suffix
+                 | cmd_prefix cmd_word
+                 | cmd_prefix
+                 | cmd_name cmd_suffix   { $$ = { "simple-command" : [$1, $2] }; }
+                 /* A bare command name: same node, without the suffix list. */
+                 | cmd_name              { $$ = { "simple-command" : [$1] } ; }
+                 ;
+cmd_name : WORD /* Apply rule 7a */
+ ;
+cmd_word : WORD /* Apply rule 7b */
+ ;
+cmd_prefix : io_redirect
+ | cmd_prefix io_redirect
+ | ASSIGNMENT_WORD
+ | cmd_prefix ASSIGNMENT_WORD
+ ;
+cmd_suffix : io_redirect { $$ = [ { "redirect" : $1 } ] ; }
+ | cmd_suffix io_redirect { var tmp = $1;
+ tmp.push( { "redirect" : $2 } );
+ $$ = tmp ; }
+ | WORD { $$ = [ { "param" : $1 } ] ; }
+ | cmd_suffix WORD { var tmp = $1;
+ tmp.push( { "param" : $2 } );
+ $$ = tmp ; }
+ ;
+redirect_list : io_redirect
+ | redirect_list io_redirect
+ ;
+io_redirect : io_file
+ | IO_NUMBER io_file
+ | io_here
+ | IO_NUMBER io_here
+ ;
+io_file : '<' filename { $$ = { "type" : $1, "dest" : $2 }; }
+ | LESSAND filename { $$ = { "type" : $1, "dest" : $2 }; }
+ | '>' filename { $$ = { "type" : $1, "dest" : $2 }; }
+ | GREATAND filename { $$ = { "type" : $1, "dest" : $2 }; }
+ | DGREAT filename { $$ = { "type" : $1, "dest" : $2 }; }
+ | LESSGREAT filename { $$ = { "type" : $1, "dest" : $2 }; }
+ | CLOBBER filename { $$ = { "type" : $1, "dest" : $2 }; }
+
+ ;
+filename : WORD /* Apply rule 2 */
+ ;
+io_here          : DLESS     here_end   { $$ = { "type" : $1, "dest" : $2 }; } /* same node shape as io_file */
+                 | DLESSDASH here_end   { $$ = { "type" : $1, "dest" : $2 }; } /* keeps the delimiter word */
+                 ;
+here_end : WORD /* Apply rule 3 */
+ ;
+newline_list : NEWLINE
+ | newline_list NEWLINE
+ ;
+linebreak : newline_list
+ | /* empty */
+ ;
+separator_op : '&'
+ | ';'
+ ;
+separator : separator_op linebreak
+ | newline_list
+ ;
+sequential_sep : ';' linebreak
+ | newline_list
+ ;
+
+
+// vim: set shiftwidth=4:
+// vim: set tabstop=4:
+// vim: set expandtab:
diff --git a/shell-jison/shell-parser.orig.y b/shell-jison/shell-parser.orig.y
new file mode 100644
index 0000000..aa6c3d2
--- /dev/null
+++ b/shell-jison/shell-parser.orig.y
@@ -0,0 +1,220 @@
+/* The POSIX Shell Grammar, copied from:
+
+The Open Group Base Specifications Issue 6
+IEEE Std 1003.1, 2004 Edition
+Copyright © 2001-2004 The IEEE and The Open Group, All Rights reserved.
+ 2. Shell Command Language
+ 2.3 Token Recognition
+
+http://pubs.opengroup.org/onlinepubs/009604599/utilities/xcu_chap02.html#tag_02_03
+*/
+
+/* -------------------------------------------------------
+ The grammar symbols
+ ------------------------------------------------------- */
+
+
+%token WORD
+%token ASSIGNMENT_WORD
+%token NAME
+%token NEWLINE
+%token IO_NUMBER
+
+
+/* The following are the operators mentioned above. */
+
+
+%token AND_IF OR_IF DSEMI
+/* '&&' '||' ';;' */
+
+
+%token DLESS DGREAT LESSAND GREATAND LESSGREAT DLESSDASH
+/* '<<' '>>' '<&' '>&' '<>' '<<-' */
+
+
+%token CLOBBER
+/* '>|' */
+
+
+/* The following are the reserved words. */
+
+
+%token If Then Else Elif Fi Do Done
+/* 'if' 'then' 'else' 'elif' 'fi' 'do' 'done' */
+
+
+%token Case Esac While Until For
+/* 'case' 'esac' 'while' 'until' 'for' */
+
+
+/* These are reserved words, not operator tokens, and are
+ recognized when reserved words are recognized. */
+
+
+%token Lbrace Rbrace Bang
+/* '{' '}' '!' */
+
+
+%token In
+/* 'in' */
+
+
+/* -------------------------------------------------------
+ The Grammar
+ ------------------------------------------------------- */
+
+
+%start complete_command
+%%
+complete_command : list separator
+ | list
+ ;
+list : list separator_op and_or
+ | and_or
+ ;
+and_or : pipeline
+ | and_or AND_IF linebreak pipeline
+ | and_or OR_IF linebreak pipeline
+ ;
+pipeline : pipe_sequence
+ | Bang pipe_sequence
+ ;
+pipe_sequence : command
+ | pipe_sequence '|' linebreak command
+ ;
+command : simple_command
+ | compound_command
+ | compound_command redirect_list
+ | function_definition
+ ;
+compound_command : brace_group
+ | subshell
+ | for_clause
+ | case_clause
+ | if_clause
+ | while_clause
+ | until_clause
+ ;
+subshell : '(' compound_list ')'
+ ;
+compound_list : term
+ | newline_list term
+ | term separator
+ | newline_list term separator
+ ;
+term : term separator and_or
+ | and_or
+ ;
+for_clause : For name linebreak do_group
+ | For name linebreak in sequential_sep do_group
+ | For name linebreak in wordlist sequential_sep do_group
+ ;
+name : NAME /* Apply rule 5 */
+ ;
+in : In /* Apply rule 6 */
+ ;
+wordlist : wordlist WORD
+ | WORD
+ ;
+case_clause : Case WORD linebreak in linebreak case_list Esac
+ | Case WORD linebreak in linebreak case_list_ns Esac
+ | Case WORD linebreak in linebreak Esac
+ ;
+case_list_ns : case_list case_item_ns
+ | case_item_ns
+ ;
+case_list : case_list case_item
+ | case_item
+ ;
+case_item_ns : pattern ')' linebreak
+ | pattern ')' compound_list linebreak
+ | '(' pattern ')' linebreak
+ | '(' pattern ')' compound_list linebreak
+ ;
+case_item : pattern ')' linebreak DSEMI linebreak
+ | pattern ')' compound_list DSEMI linebreak
+ | '(' pattern ')' linebreak DSEMI linebreak
+ | '(' pattern ')' compound_list DSEMI linebreak
+ ;
+pattern : WORD /* Apply rule 4 */
+ | pattern '|' WORD /* Do not apply rule 4 */
+ ;
+if_clause : If compound_list Then compound_list else_part Fi
+ | If compound_list Then compound_list Fi
+ ;
+else_part : Elif compound_list Then else_part
+ | Else compound_list
+ ;
+while_clause : While compound_list do_group
+ ;
+until_clause : Until compound_list do_group
+ ;
+function_definition : fname '(' ')' linebreak function_body
+ ;
+function_body : compound_command /* Apply rule 9 */
+ | compound_command redirect_list /* Apply rule 9 */
+ ;
+fname : NAME /* Apply rule 8 */
+ ;
+brace_group : Lbrace compound_list Rbrace
+ ;
+do_group : Do compound_list Done /* Apply rule 6 */
+ ;
+simple_command : cmd_prefix cmd_word cmd_suffix
+ | cmd_prefix cmd_word
+ | cmd_prefix
+ | cmd_name cmd_suffix
+ | cmd_name
+ ;
+cmd_name : WORD /* Apply rule 7a */
+ ;
+cmd_word : WORD /* Apply rule 7b */
+ ;
+cmd_prefix : io_redirect
+ | cmd_prefix io_redirect
+ | ASSIGNMENT_WORD
+ | cmd_prefix ASSIGNMENT_WORD
+ ;
+cmd_suffix : io_redirect
+ | cmd_suffix io_redirect
+ | WORD
+ | cmd_suffix WORD
+ ;
+redirect_list : io_redirect
+ | redirect_list io_redirect
+ ;
+io_redirect : io_file
+ | IO_NUMBER io_file
+ | io_here
+ | IO_NUMBER io_here
+ ;
+io_file : '<' filename
+ | LESSAND filename
+ | '>' filename
+ | GREATAND filename
+ | DGREAT filename
+ | LESSGREAT filename
+ | CLOBBER filename
+ ;
+filename : WORD /* Apply rule 2 */
+ ;
+io_here : DLESS here_end
+ | DLESSDASH here_end
+ ;
+here_end : WORD /* Apply rule 3 */
+ ;
+newline_list : NEWLINE
+ | newline_list NEWLINE
+ ;
+linebreak : newline_list
+ | /* empty */
+ ;
+separator_op : '&'
+ | ';'
+ ;
+separator : separator_op linebreak
+ | newline_list
+ ;
+sequential_sep : ';' linebreak
+ | newline_list
+ ;