[antlr-interest] Help with pesky Lexer determinism

Mark Bednarczyk voytechs at yahoo.com
Fri Jun 10 09:44:50 PDT 2005


While I'm on a roll, I added Ethernet (MAC) address matching code
for the following forms:

XX:XX:XX:XX:XX:XX
dec:dec:dec:dec:dec:dec
XX-XX-XX-XX-XX-XX
dec-dec-dec-dec-dec-dec


And here is the code, if anyone is interested:



/*
 * Rules inherited from java.g that are switched off here: each is made
 * protected and given an empty body. The COLON and IDENT token types
 * are assigned from inside NUM_INT via $setType instead.
 */
protected COLON
    :   /* intentionally empty */
    ;

protected IDENT
    :   /* intentionally empty */
    ;

// Helper rule: one to three decimal digits. Used by NUM_INT as a
// single group of a dotted IPv4 address.
protected NUM_3DIGIT
    :   ('0'..'9')              // first digit is mandatory
        (
            ('0'..'9')          // optional second digit
            ( ('0'..'9') )?     // optional third digit
        )?
    ;

// Helper rule: one to four hexadecimal digits. Used by NUM_INT as a
// single group of a MAC address or an IPv6 address.
// HEX_DIGIT is not defined in this fragment (presumably inherited from
// java.g -- confirm).
protected NUM_HEX_4DIGIT
    :   HEX_DIGIT ((HEX_DIGIT) ((HEX_DIGIT) (HEX_DIGIT)?)?)?
    ;   // terminator was missing; without it the rule ran into NUM_INT

// NUM_INT
//
// Single lexer rule that recognises, in this order of alternatives:
//   - dotted IPv4 addresses                      -> IP_V4
//   - six-group MAC addresses (':' or '-')       -> MAC_ADDRESS
//   - colon-separated hex groups, incl. "::"     -> IP_V6
//   - a lone ':'                                 -> COLON
//   - identifiers                                -> IDENT
//   - '.' alone or a '.'-leading float           -> DOT / NUM_FLOAT / NUM_DOUBLE
//   - integer and floating-point literals        -> NUM_INT / NUM_LONG /
//                                                   NUM_FLOAT / NUM_DOUBLE
//
// Syntactic predicates ("(...)=>") are used to choose between the
// address forms, which all start with the same characters.
//
// Local state:
//   isDecimal - set when the digits matched so far may be continued as a
//               floating-point literal (gates the float suffix branch).
//   t         - the FLOAT_SUFFIX token, if one was matched; used to decide
//               between NUM_FLOAT and NUM_DOUBLE.
NUM_INT
options {
    testLiterals = true;
}
    {boolean isDecimal=false; Token t=null; }

    // IPv4 RULE
    // Predicate looks at the first two dotted groups only; the body then
    // requires all four groups.
    :   (NUM_3DIGIT '.' NUM_3DIGIT '.')=>
        (
            NUM_3DIGIT '.' NUM_3DIGIT '.' NUM_3DIGIT '.' NUM_3DIGIT
            { $setType(IP_V4); }
        )

        //
        // MAC ADDRESS RULE - exactly 6 COLON/DASH separated groups of
        // 1-4 hex digits. Each separator is matched independently, so
        // ':' and '-' may be mixed. The predicate additionally requires
        // that the character following the sixth group is not ':'.
        //
    |   (NUM_HEX_4DIGIT (':'|'-') NUM_HEX_4DIGIT (':'|'-')
         NUM_HEX_4DIGIT (':'|'-') NUM_HEX_4DIGIT (':'|'-')
         NUM_HEX_4DIGIT (':'|'-') NUM_HEX_4DIGIT ~(':'))=>
        (
            NUM_HEX_4DIGIT (':'|'-') NUM_HEX_4DIGIT (':'|'-')
            NUM_HEX_4DIGIT (':'|'-') NUM_HEX_4DIGIT (':'|'-')
            NUM_HEX_4DIGIT (':'|'-') NUM_HEX_4DIGIT
        ) { $setType(MAC_ADDRESS); }

    // IPv6 RULE
    // Taken when the input starts with a hex group followed by ':'.
    |   (NUM_HEX_4DIGIT ':')=>
        (

            // Form with an embedded "::": groups, "::", optional groups.
            ((NUM_HEX_4DIGIT ':')+ ':')=>
            (
                (NUM_HEX_4DIGIT ':')+ ':'
                (NUM_HEX_4DIGIT (':' NUM_HEX_4DIGIT)*)?
            )   { $setType(IP_V6); }

            // Form without "::": two or more ':'-separated groups.
            |   NUM_HEX_4DIGIT (':' NUM_HEX_4DIGIT)+
                { $setType(IP_V6); }

        )   { $setType(IP_V6); }

    // IPv6 beginning with "::" followed by one or more groups
    |   (':' ':' NUM_HEX_4DIGIT)=>
        ':' ':' NUM_HEX_4DIGIT (':' NUM_HEX_4DIGIT)*
        { $setType(IP_V6); }

    // Bare "::"
    |   ':' ':'
        { $setType(IP_V6); }

    // Single ':' (replaces the disabled COLON rule)
    |   ':'
        { $setType(COLON); }


    // IDENT rule (replaces the disabled IDENT rule)
    |   ('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'0'..'9'|'$')*
            { $setType(IDENT); }

    // Number beginning with '.' rule
    // A lone '.' is typed DOT; if digits follow, the type is overridden
    // with NUM_FLOAT or NUM_DOUBLE below.
    |   '.' { $setType(DOT);}
            (   ('0'..'9')+ (EXPONENT)? (f1:FLOAT_SUFFIX {t=f1;})?
                {
                if (t != null && t.getText().toUpperCase().indexOf('F')>=0) {
                    _ttype = NUM_FLOAT;
                }
                else {
                    _ttype = NUM_DOUBLE; // assume double
                }
                }
            )?



    // Number beginning with a 0 rule
    |   (   '0' {isDecimal = true;} // special case for just '0'
            (   ('x'|'X')
                (   // hex
                    // the 'e'|'E' and float suffix stuff look
                    // like hex digits, hence the (...)+ doesn't
                    // know when to stop: ambig.  ANTLR resolves
                    // it correctly by matching immediately.  It
                    // is therefore ok to hush warning.
                    options {
                        warnWhenFollowAmbig=false;
                    }
                :   HEX_DIGIT
                )+

            |   //float or double with leading zero
                (('0'..'9')+ ('.'|EXPONENT|FLOAT_SUFFIX)) => ('0'..'9')+

            |   ('0'..'7')+     // octal
            )?

        // A regular number non-zero starting rule
        |   ('1'..'9') ('0'..'9')*  {isDecimal=true;}   // non-zero decimal
        )
        (   ('l'|'L') { _ttype = NUM_LONG; }

        // only check to see if it's a float if looks like decimal so far
        |   {isDecimal}?
            (   '.' ('0'..'9')* (EXPONENT)? (f2:FLOAT_SUFFIX {t=f2;})?
            |   EXPONENT (f3:FLOAT_SUFFIX {t=f3;})?
            |   f4:FLOAT_SUFFIX {t=f4;}
            )
            {
            if (t != null && t.getText().toUpperCase().indexOf('F') >= 0) {
                _ttype = NUM_FLOAT;
            }
            else {
                _ttype = NUM_DOUBLE; // assume double
            }
            }
        )?
    ;




More information about the antlr-interest mailing list