[antlr-interest] Resolving ambiguity with dates and division operations

Matthew Ford Matthew.Ford at forward.com.au
Tue Apr 15 17:46:12 PDT 2003


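I did this in the lexer, using syntactic predicates to tell dates, times and plain numbers apart (see the DATE_TIME_INT_FLOAT rule below). Here is the lexer grammar: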

// Lexer-wide options.
options {
 // allow all possible characters in the range \u0000..\u00FE, except -1 == EOF
 charVocabulary = '\u0000'..'\u00FE';
 testLiterals=false;    // don't automatically test for literals
 k=3;                   // three characters of lookahead
 caseSensitiveLiterals=false;   // literals-table lookups ignore case
 caseSensitive=false;           // character matching ignores case
 defaultErrorHandler = false;   // pass error back to parser
}

// OPERATORS
// Single-character punctuation tokens: parentheses, brackets, braces, colon,
// comma and dot.  Each rule carries a paraphrase option giving the literal
// text of the token it matches.
LPAREN    options { paraphrase = "(";}
   : '('  ;
RPAREN    options { paraphrase = ")";}
   : ')'  ;
LBRACK    options { paraphrase = "[";}
   : '['  ;
RBRACK    options { paraphrase = "]";}
   : ']'  ;
LCURLY    options { paraphrase = "{";}
   : '{'  ;
RCURLY    options { paraphrase = "}";}
   : '}'  ;
COLON     options { paraphrase = ":";}
   : ':'  ;
COMMA     options { paraphrase = ",";}
   : ','  ;
DOT     options { paraphrase = ".";}
   : '.'  ;

// Assignment: a single '='.
ASSIGN
 options { paraphrase = "=";}
   : '='  ;

// Equality and inequality.  Three spellings of "not equal" are accepted
// (/=, != and <>), each with its own token type.
EQUAL    options { paraphrase = "==";}
   : "==" ;
NOT_EQUAL1   options { paraphrase = "/=";}
   : "/=";
NOT_EQUAL2   options { paraphrase = "!=";}
   : "!=";
NOT_EQUAL3   options { paraphrase = "<>";}
   : "<>";
// '$'-prefixed variants of the operators above; the STR_ prefix suggests
// string comparison -- confirm against the parser.
STR_EQUAL    options { paraphrase = "$==";}
   : "$==" ;
STR_NOT_EQUAL1   options { paraphrase = "$/=";}
   : "$/=" ;
STR_NOT_EQUAL2   options { paraphrase = "$!=";}
   : "$!=" ;
STR_NOT_EQUAL3   options { paraphrase = "$<>";}
   : "$<>";

// Additive operators.
PLUS    options { paraphrase = "+";}
   : '+'  ;
MINUS    options { paraphrase = "-";}
   : '-'  ;
// '$'-prefixed relational operators ($>=, $>, $<=, $<); the STR_ prefix
// suggests string comparison -- confirm against the parser.
STR_GE     options { paraphrase = "$>=";}
   : "$>=" ;
STR_GT     options { paraphrase = "$>";}
   : "$>"  ;
STR_LE     options { paraphrase = "$<=";}
   : "$<=" ;
STR_LT     options { paraphrase = "$<";}
   : "$<"  ;

// Relational operators.
GE     options { paraphrase = ">=";}
   : ">=" ;
GT     options { paraphrase = ">";}
   : '>'  ;
LE     options { paraphrase = "<=";}
   : "<=" ;
LT     options { paraphrase = "<";}
   : '<'  ;
// Forward slash and backslash each get their own token type.  DIV_LEFT is
// the '/' that also appears inside DATE literals, which is why
// DATE_TIME_INT_FLOAT uses a predicate to recognise dates before a bare '/'
// is ever tokenised on its own.
DIV_LEFT  options { paraphrase = "/";}
   : '/'  ;
DIV_RIGHT   options { paraphrase = "\\";}
   : '\\'  ;
SEMI    options { paraphrase = ";";}
   : ';'  ;
QUESTIONMARK options { paraphrase = "?";}
   : '?'  ;


// Whitespace -- ignored
// A tab or form feed is matched and then discarded: the token type is set
// to Token.SKIP, so nothing is handed to the parser.
TAB_FORMFEED
  : ( '\t' | '\f' )
  { $setType(Token.SKIP); }
 ;

// Whitespace -- ignored
// A single blank is matched and discarded in the same way.
SPACE
  : ' '
  { $setType(Token.SKIP); }
 ;


// Line terminator in DOS, Macintosh or Unix form.  Every match calls
// newline(); the token is skipped only while skipNL is set, otherwise it is
// returned as a NEWLINE token.
NEWLINE
  : ( /* '\r' '\n' can be matched in one alternative or by matching
    '\r' in one iteration and '\n' in another.  I am trying to
    handle any flavor of newline that comes in, but the language
    that allows both "\r\n" and "\r" and "\n" to all be valid
    newline is ambiguous.  Consequently, the resulting grammar
    must be ambiguous.  I'm shutting this warning off.
    */
    options {
    generateAmbigWarnings=false;
   }
   : '\r' '\n'   // DOS
   | '\r'  // Macintosh
   | '\n' // Unix
   )  {newline();}
  { if (skipNL) {   // skipNL is declared outside this excerpt; per the original note it is set when skipNEWLINE() is called
    _ttype = Token.SKIP;
   }
  }
;

// A run of one or more blanks, tabs, form feeds, newlines and comments,
// marked as Token.SKIP.  The rule is protected, so it only matches when
// another lexer rule calls it; in this excerpt the only reference is in the
// commented-out part of STRING_LITERAL.
protected
WS_SET : ( ' '
  | '\t' | '\f'
  | NEWLINE
  | ML_COMMENT
  | SL_COMMENT
  )+
  {$setType(Token.SKIP);}  // way to set token type
 ;

// Single-line comments
// Matches "//", everything up to the end of the line, and the terminating
// NEWLINE, then skips the whole thing.
// NOTE(review): NEWLINE is mandatory here, so a comment on the last line of
// input with no trailing line terminator will not match -- confirm whether
// that case matters.
SL_COMMENT
 : "//"
  (~('\n'|'\r'))* NEWLINE
  {$setType(Token.SKIP);}
 ;

// multiple-line comments these are also skipped
// Matches "/*" ... "*/".  Line terminators inside the comment are routed
// through NEWLINE, which calls newline().
ML_COMMENT
 : "/*"
  ( // suppress warnings about * /
   options {
    greedy = true;
   }
  : { LA(2)!='/' }? '*'   // consume '*' only when it does not begin the closing "*/"
  | ~('*'|'\n'|'\r')
  | NEWLINE
  )*
  "*/"
  {$setType(Token.SKIP);}
 ;


// character literals SINGLE QUOTES around string
//CHAR_LITERAL
// : '\''! (ESC|~('\''|'\\'))* '\''!
// ;

// A string literal token; the actual matching is done by the protected rule
// SL_STRING_LITERAL.
// note must have WS between strings because "" is used for " inside string
STRING_LITERAL
  : SL_STRING_LITERAL //(WS_SET! SL_STRING_LITERAL)*
      // string concat does not work because it tries to concat anything
      // after a string + WS_SET.
  ;

 // string literals DOUBLE QUOTES around string
 // "" => " inside double quotes.
 // can also use \"
 // The opening and closing quotes carry the '!' suffix, so they are dropped
 // from the token text.  For a doubled quote only the second '"' carries '!',
 // so one quote character remains in the text.
 // A raw '\n' before the closing quote is reported as an error by throwing
 // from the action below.
protected
SL_STRING_LITERAL
{int i = 0;}
 : '"'! (ESC|~('"'|'\\'|'\n'))* ('"''"'! ((ESC|~('"'|'\\'|'\n')))*)*
  ('"'!
  |'\n'
   {
     // i is never modified in this rule, so this test is always true.
     // NOTE(review): presumably the guard only exists so that the Java
     // compiler does not see an unconditional throw followed by generated
     // code -- confirm.
     if (i==0) {
      throw new TokenStreamRecognitionException(
     new RecognitionException("found newline inside string:'"+$getText+"'",
getFilename(), getLine()));
    }
   }
  )
 ;

// escape sequence -- note that this is protected; it can only be called
//   from another lexer rule -- it will not ever directly return a token to
//   the parser
// There are various ambiguities hushed in this rule.  The optional
// '0'...'9' digit matches should be matched here rather than letting
// them go back to STRING_LITERAL to be matched.  ANTLR does the
// right thing by matching immediately; hence, it's ok to shut off
// the FOLLOW ambig warnings.
//
// The single-letter escapes and the escaped quote/backslash replace the
// rule's text with the character they denote via $setText.  The unicode
// and numeric alternatives have no action, so their text is left exactly as
// written in the input.
// NOTE(review): the numeric alternatives accept '0'..'9' after the leading
// digit, not only octal digits -- confirm that this is intended.
protected
ESC
 : '\\'
  ( 'n' { $setText("\n");}
  | 'r' { $setText("\r");}
  | 't' { $setText("\t");}
  | 'b' { $setText("\b");}
  | 'f' { $setText("\f");}
  | '"' { $setText("\"");}
  | '\'' { $setText("\'");}
  | '\\' { $setText("\\");}
  | ('u')+ HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
  | ('0'..'3')
   (
    options {
     warnWhenFollowAmbig = false;
    }
   : ('0'..'9')
    (
     options {
      warnWhenFollowAmbig = false;
     }
    : '0'..'9'
    )?
   )?
  | ('4'..'7')
   (
    options {
     warnWhenFollowAmbig = false;
    }
   : ('0'..'9')
   )?
  )
 ;



// hexadecimal literal: '0' 'x' followed by one or more hex digits
// (again, note it's protected!  It is called from DATE_TIME_INT_FLOAT.)
protected
HEX
 : '0' 'x' (HEX_DIGIT)+
 ;

// A single hexadecimal digit, used by HEX and by the unicode escape in ESC.
// Only lowercase 'a'..'f' is listed; the options block sets
// caseSensitive=false, which presumably covers uppercase input -- confirm.
protected
HEX_DIGIT
 : ('0'..'9'|'a'..'f')
 ;


// An identifier: a letter, '_' or '$', followed by any number of letters,
// digits, '_' or '$'.  Note that testLiterals is set to true for this rule!
// This means that after we match the rule, we look in the literals table to
// see if it's a literal or really an identifier.
IDENT
 options {testLiterals=true;
     paraphrase = "an identifier";}
// Disabled alternative that also accepts the characters '\u0080'..'\uFFFE':
// : ('a'..'z'|'_'|'$'|'\u0080'..'\uFFFE') ('a'..'z'|'_'|'0'..'9'|'$'|'\u0080'..'\uFFFE')*
 : ('a'..'z'|'_'|'$') ('a'..'z'|'_'|'0'..'9'|'$')*
 ;
// do not support summariZe
// {if ($getText().equalsIgnoreCase("summarize")) { _ttype = SUMMARISE_STATEMENT;}}

// A single decimal digit.  Protected: only used as a building block by the
// numeric, date and time rules.
protected
DIGIT
 : ('0'..'9')
 ;


// an unsigned integer literal: one or more decimal digits.
// Protected; DATE_TIME_INT_FLOAT decides when the INT token type is returned.
protected
INT
 : (DIGIT)+
 ;

// signed int
//protected
//SIGNED_INT
// : ('+'|'-')?INT
// ;

// a numeric literal date, time, int, float, hex or oct
// This is the only non-protected rule that begins with a digit.  Each
// syntactic predicate looks ahead to choose a protected rule, and the action
// then sets the token type to match:
//   four digits then '/'        -> DATE
//   one or two digits then ':'  -> TIME
//   digits then '.'             -> FLOAT
//   digits                      -> INT
//   '0' 'x'                     -> HEX
// Because a date must start with four digits followed by '/', input such as
// 4/15 is lexed as INT, DIV_LEFT, INT rather than as a date.
// NOTE(review): the header mentions octal, but no alternative here handles it.
// NOTE(review): the unpredicated INT alternative is listed before the
// '0' 'x' one -- confirm that the generated lexer still selects HEX for
// input such as 0x1f.
DATE_TIME_INT_FLOAT
 : (DIGIT DIGIT DIGIT DIGIT '/') => DATE {_ttype = DATE;}
 | (DIGIT (DIGIT)? ':') => TIME {_ttype = TIME;}
 | (INT '.') => FLOAT {_ttype = FLOAT;}
 | INT {_ttype = INT;}
 | ('0' 'x')=> HEX {_ttype = HEX;}
 ;

// Unsigned floating-point literal: digits, '.', digits, optional exponent.
// Digits are required on both sides of the '.', so "1." and ".5" do not match.
protected
FLOAT
 : INT '.' INT (EXPONENT)?
 ;

// signed float
//protected
//SIGNED_FLOAT
// : SIGNED_INT '.' INT (EXPONENT)?
// ;


// Date literal: four digits, '/', one or two digits, '/', one or two digits
// (presumably year/month/day).  No range checking is done on the fields.
protected
DATE
 : DIGIT DIGIT DIGIT DIGIT '/'
   DIGIT (DIGIT)? '/' DIGIT (DIGIT)?
 ;

// Time literal: three one-or-two-digit fields separated by ':' (presumably
// hours:minutes:seconds), optionally followed by '.' and a fractional part.
// No range checking is done on the fields.
protected
TIME
 : DIGIT (DIGIT)? ':'
   DIGIT (DIGIT)? ':'
   DIGIT (DIGIT)? ('.' INT)?
 ;

// need to add floating point
// Helper for floating-point literals: the exponent suffix, i.e. 'e', an
// optional sign, and one or more decimal digits.  Protected, so it is only
// reachable through FLOAT.
protected
EXPONENT
 : 'e' ( '+' | '-' )? ( DIGIT )+
 ;


// should be in parser also duration
----- Original Message -----
From: "oneway_111" <oneway_111 at yahoo.com>
To: <antlr-interest at yahoogroups.com>
Sent: Wednesday, April 16, 2003 10:33 AM
Subject: [antlr-interest] Resolving ambiguity with dates and division
operations


> The problem is that the parser needs to be able to parse formulas that
> could contain math operations and/or operations with dates. The dates
> are specified in the format "mm/dd/yyyy" or "mm/dd/yy", e.g.
> "04/15/2003".
>
> How would one go about writing rules to be able to distinguish dates
> from math expressions containing several division operations, i.e.
> 04/15/2003 should be April 15, 2003 and not 4 divided by 15 divided by
> 2003?
>
> Thanks
>
>
>
>
> Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/
>
>


 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ 




More information about the antlr-interest mailing list