[antlr-interest] Actionscript (or at least javascript) Support ??

Sat May 24 05:26:56 PDT 2003

robifofo wrote:
> Hi all
> 
> I want to create a compiler for Actionscript (a javascript-like
> language which is used inside the Macromedia flash player).
> 
> Has anybody done something in this direction ?
> Or does a grammar file at least exist somewhere for JavaScript, which
> is similar to ActionScript? (I haven't found one in the distribution,
> but I am new to ANTLR and this is my first post to this list, so maybe
> I have missed something...)
> 
> any help highly appreciated
> 
> Roberto Saccon
> 
> 
>  
> 
> Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ 
> 
> 
I have written one, which is actually a modified version of the Java grammar.
I wrote it to parse ASP files (HTML, VBScript, JScript), so some
modifications will be needed in order to use it.

This is not the latest version of the grammar.
On Tuesday I can send the latest version.

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ 

-------------- next part --------------
header {
  package gr.omadak.leviathan.asp;
}

// The parser grammar.  Rules below use the ANTLR tree operators: a trailing
// '^' makes a token the root of the current subtree, a trailing '!' leaves
// the token out of the AST.
class JsParser extends Parser;
options {
  k = 2;                          // two tokens of lookahead
  exportVocab=Js;                 // token vocabulary is exported under the name "Js"
  codeGenMakeSwitchThreshold = 2;
  codeGenBitsetTestThreshold = 3;
  defaultErrorHandler = false;    // no generated error handlers in the rules
  buildAST = true;                // rules build an AST
}

// Token types declared without a lexer rule of their own.
tokens {
    // Built-in type names and reserved words.  Presumably these are the values
    // IDENTIFIER_TYPES looks up in JsConstants.KEYWORDS (not visible here) --
    // TODO confirm.
    // NOTE(review): the statement rule also uses DO, which is not listed.
    ARRAY;BOOLEAN; DATE; NAN; NUMBER; OBJECT; STRING; BREAK;
    CASE; CATCH; CONTINUE; DEFAULT; DELETE; ELSE; FALSE;
    FINALLY; FOR; FUNCTION; IF; IN; INSTANCEOF; NEW; NULL;
    RETURN; SWITCH; THIS; THROW; TRUE; TRY; TYPEOF;
    UNDEFINED; VAR; VOID; WHILE; WITH;

    // Node types used by parser actions to retype or create AST nodes
    // (e.g. setType(SLIST), #[ELIST,"ELIST"]).
    SLIST; ARRAY_DECLARATOR; INDEX_OP; POST_INC; POST_DEC; METHOD_CALL;
    EXPR; UNARY_MINUS; UNARY_PLUS; CASE_GROUP; ELIST; FOR_INIT;
    FOR_CONDITION;FOR_ITERATOR; EMPTY_STAT;LABELED_STAT;

    // HTML and EQ_HTML are accepted by the statement rule; no lexer rule in
    // this grammar produces them by name (presumably they come from the
    // surrounding HTML/ASP lexer -- TODO confirm).
    HTML;JS_END;EQ_HTML;
}

// Entry point: any number of statements up to end of input.
// EOF is matched but kept out of the AST.
start_rule
  : ( statement )* EOF!
  ;

// A dotted name: IDENTIFIER (DOT IDENTIFIER)*, each DOT becoming a subtree root.
// NOTE(review): no other rule in the visible grammar references this rule.
identifier
  : 
  IDENTIFIER  ( DOT^ IDENTIFIER )*
  ;

// A brace-delimited block.  The opening brace becomes the subtree root and is
// retyped to SLIST; the closing brace is dropped from the AST.
compoundStatement
  : lc:LCURLY^ {#lc.setType(SLIST);}
      // include the (possibly-empty) list of statements
      (statement)*
    RCURLY!
  ;

// A "var" declaration: one or more names, each with an optional initializer.
// Commas are dropped.  No token is marked as a root here, so VAR, the names,
// ASSIGN and the initializer expressions are produced as siblings.
declaration
  :
  VAR IDENTIFIER (ASSIGN expression)?
  (COMMA! IDENTIFIER (ASSIGN expression)?)*
  ;

// A single statement.  Keyword-led alternatives make the keyword the subtree
// root (^) and drop the surrounding punctuation (!).
statement
  :
  compoundStatement
  | FUNCTION^ IDENTIFIER LPAREN! argList RPAREN! statement
  | declaration SEMI!
  | expression SEMI!
  // labelled statement: the COLON node is retyped and becomes the root
  | IDENTIFIER c:COLON^ {#c.setType(LABELED_STAT);} statement
  | if_rule
  // WITH roots its own subtree and the closing parenthesis is dropped, giving
  // the same shape as the WHILE alternative below.  (Previously WITH was not a
  // root and RPAREN was left in the AST.)
  | WITH^ LPAREN! expression RPAREN! statement
  | FOR^
      LPAREN!
      (
      // A for-in header is chosen when "expression IN" can be matched.
      // NOTE(review): a header starting with VAR cannot satisfy this
      // predicate (no expression starts with VAR), so "for (var x in y)"
      // falls through to the three-part form and fails at the first SEMI.
      (expression IN)=>
          in_statement
          |
          forInit SEMI!   // initializer
          forCond SEMI!   // condition test
          forIter         // updater
      )
      RPAREN!
      statement
  | WHILE^ LPAREN! expression RPAREN! statement
  | DO^ statement WHILE! LPAREN! expression RPAREN! SEMI!
  | BREAK^ (IDENTIFIER)? SEMI!
  | CONTINUE^ (IDENTIFIER)? SEMI!
  | RETURN^ (expression)? SEMI!
  | SWITCH^ LPAREN! expression RPAREN! LCURLY!
      ( casesGroup )*
    RCURLY!
  | tryBlock
  | THROW^ expression SEMI!
  // empty statement: the lone semicolon is kept and retyped
  | s:SEMI {#s.setType(EMPTY_STAT);}
  | HTML
  | EQ_HTML^ expression SEMI!
  ;

// if / else.  IF is the subtree root; the parentheses and the ELSE keyword are
// dropped, so an else branch appears as an extra child after the "then"
// statement.  The (ELSE)=> predicate takes an ELSE whenever one is next; the
// empty alternative covers an if without else.
if_rule
  :
  IF^ LPAREN! expression RPAREN! statement
  (
      (ELSE)=> ELSE! statement
      |
  )
  ;

// Header of a for-in loop: two expressions joined by IN, with IN as the root.
in_statement
  : expression IN^ expression
  ;

// One group inside a switch body: one or more case/default labels followed by
// the statement list they share.  The result is wrapped in a CASE_GROUP node.
casesGroup
  : ( // CONFLICT: to which case group do the statements bind?
      //           ANTLR generates proper code: it groups the
      //           many "case"/"default" labels together then
      //           follows them with the statements
      options {
        warnWhenFollowAmbig = false;
      }
      :
      aCase
    )+
    caseSList
    {#casesGroup = #([CASE_GROUP, "CASE_GROUP"], #casesGroup);}
  ;

// A single switch label: CASE with its expression (CASE is the root), or a
// bare DEFAULT.  The trailing colon is dropped.
aCase
  : ( CASE^ expression
    | DEFAULT
    )
    COLON!
  ;

// The (possibly empty) statements of a case group, wrapped in an SLIST node.
caseSList
  : (statement)*
    {#caseSList = #(#[SLIST,"SLIST"],#caseSList);}
  ;

// The initializer for a for loop.  Optional; whatever is matched (possibly
// nothing) is wrapped in a FOR_INIT node.
forInit
    // if it looks like a declaration, it is
  : ( declaration
    // otherwise it could be an expression list...
    | expressionList
    )?
    {#forInit = #(#[FOR_INIT,"FOR_INIT"],#forInit);}
  ;

// The condition of a for loop.  Optional; the result is wrapped in a
// FOR_CONDITION node even when no expression is present.
forCond
  : (expression)?
    {#forCond = #(#[FOR_CONDITION,"FOR_CONDITION"],#forCond);}
  ;

// The update part of a for loop.  Optional; the result is wrapped in a
// FOR_ITERATOR node even when no expression list is present.
forIter
  : (expressionList)?
    {#forIter = #(#[FOR_ITERATOR,"FOR_ITERATOR"],#forIter);}
  ;

// A try block with any number of catch handlers and an optional finally.
// TRY is the root of the block and its handlers; FINALLY is also marked with
// '^', so when present it becomes the root above what was built so far.
tryBlock
  : TRY^ compoundStatement
    (handler)*
    ( FINALLY^ compoundStatement )?
  ;

// A catch clause: CATCH is the root, followed by the caught name and the
// handler block.  The parentheses are dropped.
handler
  : CATCH^
    LPAREN! IDENTIFIER RPAREN!
    compoundStatement
  ;

// A full expression.  The assignmentExpression subtree is wrapped in an EXPR
// node.
expression
  : assignmentExpression
    {#expression = #(#[EXPR,"EXPR"],#expression);}
  ;

// A comma-separated list of one or more expressions.  The commas are dropped
// and the expressions are wrapped in an ELIST node.
expressionList
  : expression (COMMA! expression)*
    {#expressionList = #(#[ELIST,"ELIST"], expressionList);}
  ;

// assignment expression (level 13)
// The operator, when present, becomes the root.  The right-hand side recurses
// into this same rule, so chained assignments nest to the right.
assignmentExpression
  : conditionalExpression
    ( ( ASSIGN^
            |   PLUS_ASSIGN^
            |   MINUS_ASSIGN^
            |   STAR_ASSIGN^
            |   DIV_ASSIGN^
            |   MOD_ASSIGN^
            |   SR_ASSIGN^
            |   BSR_ASSIGN^
            |   SL_ASSIGN^
            |   BAND_ASSIGN^
            |   BXOR_ASSIGN^
            |   BOR_ASSIGN^
            )
      assignmentExpression
    )?
  ;

// conditional test (level 12)
// "cond ? a : b": QUESTION is the root and the colon is dropped.  The middle
// operand is an assignmentExpression; the last operand recurses into this
// rule.
conditionalExpression
  : logicalOrExpression
    ( QUESTION^ assignmentExpression COLON! conditionalExpression )?
  ;

// logical or (||)  (level 11)
// Each LOR becomes the root over what has been matched so far.
logicalOrExpression
  : logicalAndExpression
    ( LOR^ logicalAndExpression )*
  ;

// logical and (&&)  (level 10)
// Each LAND becomes the root over what has been matched so far.
logicalAndExpression
  : inclusiveOrExpression
    ( LAND^ inclusiveOrExpression )*
  ;

// bitwise (non-short-circuiting) or (|)  (level 9)
// Each BOR becomes the root over what has been matched so far.
inclusiveOrExpression
  : exclusiveOrExpression
    ( BOR^ exclusiveOrExpression )*
  ;

// exclusive or (^)  (level 8)
// Each BXOR becomes the root over what has been matched so far.
exclusiveOrExpression
  : andExpression
    ( BXOR^ andExpression )*
  ;

// bitwise (non-short-circuiting) and (&)  (level 7)
// Each BAND becomes the root over what has been matched so far.
andExpression
  : equalityExpression
    ( BAND^ equalityExpression )*
  ;

// equality/inequality (==/!=) (level 6)
// The matched operator becomes the root over what has been matched so far.
equalityExpression
  : relationalExpression
    ( ( NOT_EQUAL^
      | EQUAL^
      )
      relationalExpression
    )*
  ;

// boolean relational expressions (level 5)
// Either a chain of <, >, <=, >= comparisons, or a single INSTANCEOF test.
// The two forms are alternatives of one another, so they cannot be combined
// in one relationalExpression.  The right-hand side of INSTANCEOF is limited
// to what the `type` rule accepts.
relationalExpression
  : shiftExpression
    ( ( ( LT^
        | GT^
        | LE^
        | GE^
        )
        shiftExpression
      )*
    | INSTANCEOF^ type
    )
  ;

// bit shift expressions (level 4)
// The matched operator becomes the root over what has been matched so far.
shiftExpression
  : additiveExpression
    ( ( SL^
      | SR^
      | BSR^
      )
      additiveExpression
    )*
  ;

// binary addition/subtraction (level 3)
// The matched operator becomes the root over what has been matched so far.
additiveExpression
  : multiplicativeExpression
    ( ( PLUS^
      | MINUS^
      )
      multiplicativeExpression
    )*
  ;

// multiplication/division/modulo (level 2)
// The matched operator becomes the root over what has been matched so far.
multiplicativeExpression
  : unaryExpression
    ( ( STAR^
      | DIV^
      | MOD^
      )
      unaryExpression
    )*
  ;

// Prefix ++ / --, and unary minus / plus.  The MINUS and PLUS tokens are
// retyped to UNARY_MINUS / UNARY_PLUS so the AST distinguishes them from the
// binary operators.  Everything else is handled by
// unaryExpressionNotPlusMinus.
unaryExpression
  : INC^ unaryExpression
  | DEC^ unaryExpression
  | MINUS^ {#MINUS.setType(UNARY_MINUS);} unaryExpression
  | PLUS^  {#PLUS.setType(UNARY_PLUS);} unaryExpression
  | unaryExpressionNotPlusMinus
  ;

// Bitwise not (~) and logical not (!) applied to a unary expression, or a
// plain postfix expression.
unaryExpressionNotPlusMinus
  : BNOT^ unaryExpression
  | LNOT^ unaryExpression
  | postfixExpression
  ;

// qualified names, array expressions, method invocation, post inc/dec
// Either an array initializer on its own, or a primary expression followed by
// any number of member accesses, index operations and calls, and finally an
// optional post-increment or post-decrement.
postfixExpression
  :
  arrayInitilizer
  | (
  primaryExpression // start with a primary

    (
    // qualified id (id.id.id.id...) -- build the name
      DOT^ (
          IDENTIFIER
        | newExpression
        )
      // NOTE(review): the comment that stood here referred to "class" being
      //   the last qualifier; this rule has no such alternative.

      // an array indexing operation: LBRACK is retyped to INDEX_OP
    | lb:LBRACK^ {#lb.setType(INDEX_OP);} expression RBRACK!
      // a call: LPAREN is retyped to METHOD_CALL
    | lp:LPAREN^ {#lp.setType(METHOD_CALL);}
        argList
      RPAREN!
    )*

    // possibly add on a post-increment or post-decrement.
    // allows INC/DEC on too much, but semantics can check
    ( in:INC^ {#in.setType(POST_INC);}
    | de:DEC^ {#de.setType(POST_DEC);}
    | // nothing
    )
   )
  ;

// An array literal: a bracketed, comma-separated, possibly empty list of
// expressions.  No token is marked with '^' or '!', so the brackets and commas
// stay in the AST as siblings of the expressions.
arrayInitilizer
  :
  LBRACK (expression (COMMA expression)*)? RBRACK
  ;

// the basic element of an expression: a name, a `new` expression, a literal
// constant, one of the keyword values, or a parenthesised assignment
// expression (the parentheses are dropped).
// NOTE(review): THIS is declared in the tokens section but is not accepted
// here.
primaryExpression
  : IDENTIFIER
  | newExpression
  | constant
  | TRUE
  | FALSE
  | NULL
  | UNDEFINED
  | LPAREN! assignmentExpression RPAREN!
  ;

// One of the built-in type-name tokens.  Used after NEW and INSTANCEOF.
type
  : ARRAY
  | DATE
  | OBJECT
  | STRING
  | NUMBER
  | BOOLEAN
  ;

// Object creation: NEW is the root, followed by the type and the argument
// list; the parentheses are dropped.
// NOTE(review): only the tokens accepted by `type` can follow NEW, so a
// constructor named by a plain IDENTIFIER is not matched by this rule.
newExpression
  : NEW^ type LPAREN! argList RPAREN!
  ;

// An argument list.  Either an expressionList (which already yields an ELIST
// node) or nothing, in which case an empty ELIST node is created so the caller
// always gets an ELIST.
argList
  : ( expressionList
    | /*nothing*/
      {#argList = #[ELIST,"ELIST"];}
    )
  ;

// One or more "[ expr? ]" groups; each LBRACK is retyped to ARRAY_DECLARATOR
// and becomes a root, each RBRACK is dropped.
// NOTE(review): no other rule in the visible grammar references this rule.
newArrayDeclarator
  : (
      // CONFLICT:
      // newExpression is a primaryExpression which can be
      // followed by an array index reference.  This is ok,
      // as the generated code will stay in this loop as
      // long as it sees an LBRACK (proper behavior)
      options {
        warnWhenFollowAmbig = false;
      }
    :
      lb:LBRACK^ {#lb.setType(ARRAY_DECLARATOR);}
        (expression)?
      RBRACK!
    )+
  ;

// A literal constant: integer, string, or floating-point.
constant
  : NUM_INT | STRING_LITERAL | NUM_FLOAT
  ;

//----------------------------------------------------------------------------
// The JavaScript scanner
//----------------------------------------------------------------------------
class JsLexer extends Lexer;

options {
  exportVocab=Js;      // call the vocabulary "Js" (same name the parser exports)
  testLiterals=false;    // don't automatically test for literals
  k=4;                   // four characters of lookahead
  filter=false;
}

{
    // Type of the last token recorded by IDENTIFIER_TYPES or NUM_INT (NUM_INT
    // is also the rule that produces DOT).  Other rules do not update it.
    // Starts at -1.
    private int lastToken = -1;
}

// OPERATORS
// One lexer rule per operator or punctuation token.
QUESTION    : '?'   ;
LPAREN      : '('   ;
RPAREN      : ')'   ;
LBRACK      : '['   ;
RBRACK      : ']'   ;
LCURLY      : '{'   ;
RCURLY      : '}'   ;
COLON     : ':'   ;
COMMA     : ','   ;
// DOT has no rule of its own: NUM_INT matches '.' and sets the type to DOT
// unless digits follow.
//DOT     : '.'   ;
ASSIGN      : '='   ;
EQUAL     : "=="  ;
LNOT      : '!'   ;
BNOT      : '~'   ;
NOT_EQUAL   : "!="  ;
DIV       : '/'   ;
DIV_ASSIGN    : "/="  ;
PLUS      : '+'   ;
PLUS_ASSIGN   : "+="  ;
INC       : "++"  ;
MINUS     : '-'   ;
MINUS_ASSIGN  : "-="  ;
DEC       : "--"  ;
STAR      : '*'   ;
STAR_ASSIGN   : "*="  ;
MOD       : '%'   ;
MOD_ASSIGN    : "%="  ;
SR        : ">>"  ;
SR_ASSIGN   : ">>=" ;
BSR       : ">>>" ;
BSR_ASSIGN    : ">>>="  ;
GE        : ">="  ;
GT        : ">"   ;
SL        : "<<"  ;
SL_ASSIGN   : "<<=" ;
LE        : "<="  ;
LT        : '<'   ;
BXOR      : '^'   ;
BXOR_ASSIGN   : "^="  ;
BOR       : '|'   ;
BOR_ASSIGN    : "|="  ;
LOR       : "||"  ;
BAND      : '&'   ;
BAND_ASSIGN   : "&="  ;
LAND      : "&&"  ;
SEMI      : ';'   ;

// Matches an IDENTIFIER and decides its final token type.  If the previously
// recorded token was not DOT and the text is a key of JsConstants.KEYWORDS,
// the mapped Integer value becomes the token type; otherwise the type is
// IDENTIFIER.  The chosen type is stored in lastToken.
IDENTIFIER_TYPES
  :
  id:IDENTIFIER {
      /*
        Since ASP uses ActiveX components
        and an ActiveX can have a method or attribute with
        name equal to some keyword, be sure that the last token
        wasn't DOT
      */
      if (lastToken != DOT) {
          //System.out.println(getLine() + ":" + getColumn() + " Wasn't DOT");
          String idValue = id.getText();
          Object intVal = JsConstants.KEYWORDS.get(idValue);
          if (intVal != null) {
              _ttype = ((Integer) intVal).intValue();
          } else {
              _ttype = IDENTIFIER;
          }
      } else {
              // directly after a DOT: always a plain identifier
              _ttype = IDENTIFIER;
      }
      lastToken = _ttype;
  }
  ;

// Whitespace -- ignored
// Matches one space, tab, form feed or line break per token and marks it as
// SKIP.  Line breaks call newline().
WS  : ( ' '
    | '\t'
    | '\f'
    // handle newlines
    | ( "\r\n"  // Evil DOS
      | '\r'    // Macintosh
      | '\n'    // Unix (the right way)
      )
      { newline(); }
    )
    { _ttype = Token.SKIP; }
  ;

// Single-line comments -- skipped.
// The terminating line break is optional, so a comment on the last line of
// the input (with no trailing line break) still lexes.  newline() is called
// only when a line break was actually consumed.
SL_COMMENT
  : "//"
    (~('\n'|'\r'))*
    ( ('\n'|'\r'('\n')?) {newline();} )?
    {$setType(Token.SKIP);}
  ;

// multiple-line comments -- skipped.
// Consumes everything between "/*" and the first "*/", calling newline() for
// each line break inside the comment.
ML_COMMENT
  : "/*"
    ( /*  '\r' '\n' can be matched in one alternative or by matching
        '\r' in one iteration and '\n' in another.  I am trying to
        handle any flavor of newline that comes in, but the language
        that allows both "\r\n" and "\r" and "\n" to all be valid
        newline is ambiguous.  Consequently, the resulting grammar
        must be ambiguous.  I'm shutting this warning off.
       */
      options {
        generateAmbigWarnings=false;
      }
    :
      // a '*' belongs to the comment body only if it is not followed by '/'
      { LA(2)!='/' }? '*'
    | '\r' '\n'   {newline();}
    | '\r'      {newline();}
    | '\n'      {newline();}
    | ~('*'|'\n'|'\r')
    )*
    "*/"
    {$setType(Token.SKIP);}
  ;

// string literals, double- or single-quoted; the quotes are part of the token.
// NOTE(review): the double-quoted form excludes '\\' from the plain-character
// set, so a backslash there must start an ESC.  The single-quoted form does
// not exclude it, so the two forms treat a backslash differently -- confirm
// whether that is intended.
STRING_LITERAL
  : '"' (ESC|~('"'|'\\'))* '"'
  | '\'' ( ESC | ~'\'' )* '\''
  ;

// escape sequence -- note that this is protected; it can only be called
//   from another lexer rule -- it will not ever directly return a token to
//   the parser
// There are various ambiguities hushed in this rule.  The optional
// octal digit matches should be matched here rather than letting
// them go back to STRING_LITERAL to be matched.  ANTLR does the
// right thing by matching immediately; hence, it's ok to shut off
// the FOLLOW ambig warnings.
//
// Accepted forms after the backslash: n r t b f " ' \ , a unicode escape
// (one or more 'u' followed by exactly four hex digits), or an octal escape.
// An octal escape is up to three digits when it starts with 0..3 and up to
// two digits when it starts with 4..7; every digit is in the range 0..7.
protected
ESC
  : '\\'
    ( 'n'
    | 'r'
    | 't'
    | 'b'
    | 'f'
    | '"'
    | '\''
    | '\\'
    | ('u')+ HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    | ('0'..'3')
      (
        options {
          warnWhenFollowAmbig = false;
        }
      : ('0'..'7')
        (
          options {
            warnWhenFollowAmbig = false;
          }
        : '0'..'7'
        )?
      )?
    | ('4'..'7')
      (
        options {
          warnWhenFollowAmbig = false;
        }
        // second digit of an octal escape: 0..7 (was 0..9, which let the
        // non-octal digits 8 and 9 be consumed as part of the escape)
      : ('0'..'7')
      )?
    )
  ;

// A single hexadecimal digit, upper or lower case.  Protected: only usable
// from other lexer rules.
protected HEX_DIGIT
  : '0'..'9'
  | 'a'..'f'
  | 'A'..'F'
  ;

// Matches "%>" and reports it with the token type HtmlLexerUtil.ASP_END
// (defined outside this grammar).
ASPEND
  :
  "%>" { _ttype = HtmlLexerUtil.ASP_END;}
  ;

// Matches a closing tag "</word>".  The word must be "script" (compared
// case-insensitively); anything else raises a RecognitionException before the
// '>' is matched.  The token is reported as HtmlLexerUtil.JS_END.
// NOTE(review): the grammar's own tokens section also declares JS_END; the
// value used here comes from HtmlLexerUtil -- confirm they are meant to agree.
DUMMY
  :
  "</" s:WORD
  {
      if (!"script".equalsIgnoreCase(s.getText())) {
          throw new RecognitionException("Expected \"script\" at line:"
          + getLine() + " at column:" + getColumn());
      }
   }
  '>'
  { _ttype = HtmlLexerUtil.JS_END;}
  ;

// Matches a directive of the form  @word = value , where spaces may follow
// the word, the '=' and the value.  The word must be "language" (compared
// case-insensitively) or a RecognitionException is raised.  The value is a
// string literal or an identifier; its text -- including the quotes when it
// is a string literal, since STRING_LITERAL matches them -- becomes the
// token text.  The token type is HtmlLexerUtil.LANGUAGE.
LANGUAGE
  {String lang = null;}
  :
  '@' w:WORD (' ')*
  {
      if (!"language".equalsIgnoreCase(w.getText())) {
          throw new RecognitionException("Expected \"language\" at line:"
          + getLine() + " at column:" + getColumn());
      }
   }
   '=' (' ')*
   (
      l:STRING_LITERAL {lang = l.getText();}
      | i:IDENTIFIER   {lang = i.getText();}
   ) (' ')*   
   {
      $setType(HtmlLexerUtil.LANGUAGE);
      $setText(lang);
   }
   ;

// One or more ASCII letters.  Protected: only usable from other lexer rules
// (DUMMY and LANGUAGE).
protected WORD
  : ( 'A'..'Z' | 'a'..'z' )+
  ;

// a dummy rule to force the vocabulary to be all characters, except the
//   special ones that ANTLR uses internally (0 to 2)
protected
VOCAB
  : '\3'..'\377'
  ;

// an identifier: a letter, '_' or '$', followed by any number of letters,
// digits, '_' or '$'.  This rule is protected and does no keyword check of
// its own (testLiterals is false for this lexer); IDENTIFIER_TYPES calls it
// and decides whether the text is a keyword or really an identifier.
protected IDENTIFIER
  : ('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'0'..'9'|'$')*
  ;

// a numeric literal -- and also the DOT token.
// A leading '.' yields DOT, or NUM_FLOAT when digits follow.  Otherwise the
// rule matches a hex, octal or decimal integer, optionally followed by an
// 'l'/'L' suffix or by a fraction / exponent / float suffix, in which case
// the type becomes NUM_FLOAT.  The final token type is stored in lastToken.
NUM_INT
  {boolean isDecimal=false;}
  :
  (
  '.' {_ttype = DOT;}
      (('0'..'9')+ (EXPONENT)? (FLOAT_SUFFIX)? { _ttype = NUM_FLOAT; })?
  | ( '0' {isDecimal = true;} // special case for just '0'
      ( ('x'|'X')
        (                     // hex
          // the 'e'|'E' and float suffix stuff look
          // like hex digits, hence the (...)+ doesn't
          // know when to stop: ambig.  ANTLR resolves
          // it correctly by matching immediately.  It
          // is therefor ok to hush warning.
          options {
            warnWhenFollowAmbig=false;
          }
        : HEX_DIGIT
        )+
      | ('0'..'7')+                 // octal
      )?
    | ('1'..'9') ('0'..'9')*  {isDecimal=true;}   // non-zero decimal
    )
    ( ('l'|'L')

    // only check to see if it's a float if looks like decimal so far
    | {isDecimal}?
      ( '.' ('0'..'9')* (EXPONENT)? (FLOAT_SUFFIX)?
      | EXPONENT (FLOAT_SUFFIX)?
      | FLOAT_SUFFIX
      )
      { _ttype = NUM_FLOAT; }
    )?
  )
  {
      // remember the resulting type; IDENTIFIER_TYPES checks it against DOT
      lastToken = _ttype;
  }

  ;

// Protected helper for floating-point literals: the exponent part, i.e.
// 'e' or 'E', an optional sign, and at least one digit.
protected EXPONENT
  : ( 'e' | 'E' )
    ( '+' | '-' )?
    ( '0'..'9' )+
  ;

// Protected helper for floating-point literals: a single type-suffix letter.
protected FLOAT_SUFFIX
  : 'f'
  | 'F'
  | 'd'
  | 'D'
  ;

-------------- next part --------------
A non-text attachment was scrubbed...
Name: JsConstants.java
Type: java/*
Size: 2186 bytes
Desc: not available
Url : http://www.antlr.org/pipermail/antlr-interest/attachments/20030524/1dd98f81/JsConstants.bin