[antlr-interest] Return types on lexer rules do not seem to work.

Johannes Luber jaluber at gmx.de
Wed Jul 18 15:33:05 PDT 2007


Kay Roepke wrote:
> 
> Scopes do work in lexers. Apart from probably not being very useful, since
> lexer rules normally don't recurse that much, it's ok.
> 
> See this grammar:
> 
....
>     
> 
> Looking at the generated code, everything's cool. This won't enable you to
> pass stuff between
> the parser and the lexer, though.
> Don't ask me how this would behave in a combined grammar (I don't wanna
> know it anyway...;))
> 
> What's the reason for doing this? Is there a total requirement to pass
> stuff between lexical rules?

Not sure what you mean by "total requirement", but I had to pass down
the Unicode character class and use it in a predicate. Because of hoisting,
using rule parameters was out. Possibly the hoisting will force me to
move the scope variable setting upwards, but I haven't checked this yet.
The grammar is below, if you're interested.

Best regards,
Johannes Luber


// One identifier start character followed by zero or more identifier part characters.
fragment IDENTIFIER_OR_KEYWORD
	:	IDENTIFIER_START_CHARACTER ( IDENTIFIER_PART_CHARACTER )*
	;

// First character of an identifier: either a letter character or the
// underscore (U+005F), written literally or as a 4-digit / 8-digit escape.
fragment IDENTIFIER_START_CHARACTER
options{backtrack=true; memoize=true;}
	:	LETTER_CHARACTER
	// The underscore character U+005F in every variant. The comment is kept on
	// its own lines so that mail/archive line-wrapping cannot spill it into the rule.
	|	'_' | '\\u005F' | '\\U0000005F'
	;
	
// A character that may follow the start character of an identifier
// (see IDENTIFIER_OR_KEYWORD). Alternatives are tried in the order listed.
fragment IDENTIFIER_PART_CHARACTER
options {
	backtrack = true;
	memoize = true;
}
	:	LETTER_CHARACTER
	|	DECIMAL_DIGIT_CHARACTER
	|	CONNECTING_CHARACTER
	|	COMBINING_CHARACTER
	|	FORMATTING_CHARACTER
	;
	
// A Unicode character of classes Lu, Ll, Lt, Lm, Lo, or Nl (possibly encoded).
//
// The first six alternatives match a character of the class directly. The last
// six match an escape sequence: each one first stores the required category in
// $UnicodeClassScope::allowedClass, then invokes UNICODE_ESCAPE_SEQUENCE, whose
// predicate compares the decoded character's category against that value.
//
// NOTE(review): with backtrack=true, ANTLR 3 normally skips embedded actions
// while speculating, so allowedClass may not be set yet when the predicate is
// evaluated during backtracking -- confirm against the generated code.
fragment LETTER_CHARACTER
options{backtrack=true; memoize=true;}
scope UnicodeClassScope;
	:	UNICODE_CLASS_Lu
	|	UNICODE_CLASS_Ll
	|	UNICODE_CLASS_Lt
	|	UNICODE_CLASS_Lm
	|	UNICODE_CLASS_Lo
	|	UNICODE_CLASS_Nl
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.UppercaseLetter; } UNICODE_ESCAPE_SEQUENCE
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.LowercaseLetter; } UNICODE_ESCAPE_SEQUENCE
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.TitlecaseLetter; } UNICODE_ESCAPE_SEQUENCE
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.ModifierLetter; } UNICODE_ESCAPE_SEQUENCE
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.OtherLetter; } UNICODE_ESCAPE_SEQUENCE
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.LetterNumber; } UNICODE_ESCAPE_SEQUENCE
	;

// Matches a Unicode character of class Mn or Mc, given either directly or as
// an escape sequence. For the escaped forms the required category is stored in
// $UnicodeClassScope::allowedClass before UNICODE_ESCAPE_SEQUENCE is invoked.
fragment COMBINING_CHARACTER
options {
	backtrack = true;
	memoize = true;
}
scope UnicodeClassScope;
	:	UNICODE_CLASS_Mn
	|	UNICODE_CLASS_Mc
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.NonSpacingMark; }
		UNICODE_ESCAPE_SEQUENCE
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.SpacingCombiningMark; }
		UNICODE_ESCAPE_SEQUENCE
	;

// Matches a Unicode character of class Nd, given either directly or as an
// escape sequence. For the escaped form the required category is stored in
// $UnicodeClassScope::allowedClass before UNICODE_ESCAPE_SEQUENCE is invoked.
fragment DECIMAL_DIGIT_CHARACTER
scope UnicodeClassScope;
	:	UNICODE_CLASS_Nd
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.DecimalDigitNumber; }
		UNICODE_ESCAPE_SEQUENCE
	;

// Matches a Unicode character of class Pc, given either directly or as an
// escape sequence. For the escaped form the required category is stored in
// $UnicodeClassScope::allowedClass before UNICODE_ESCAPE_SEQUENCE is invoked.
fragment CONNECTING_CHARACTER
scope UnicodeClassScope;
	:	UNICODE_CLASS_Pc
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.ConnectorPunctuation; }
		UNICODE_ESCAPE_SEQUENCE
	;

// Matches a Unicode character of class Cf, given either directly or as an
// escape sequence. For the escaped form the required category is stored in
// $UnicodeClassScope::allowedClass before UNICODE_ESCAPE_SEQUENCE is invoked.
fragment FORMATTING_CHARACTER
scope UnicodeClassScope;
	:	UNICODE_CLASS_Cf
	|	{ $UnicodeClassScope::allowedClass = UnicodeCategory.Format; }
		UNICODE_ESCAPE_SEQUENCE
	;

// Restricts the unicode escape sequence to certain unicode character classes.
//
// Matches a backslash followed by 'u' and four hex digits, or by 'U' and eight
// hex digits. After the prefix has been consumed, a semantic predicate peeks at
// the upcoming hex digits via input.LT(k), decodes them, and only lets the
// alternative succeed when the decoded character's UnicodeCategory equals
// $UnicodeClassScope::allowedClass.
//
// allowedClass is expected to be set by the invoking rule, which declares
// `scope UnicodeClassScope;`. This rule must NOT declare that scope itself:
// a rule-level scope declaration pushes a fresh scope instance on entry, which
// would hide the invoker's value and leave the predicate comparing against an
// uninitialized allowedClass.
fragment UNICODE_ESCAPE_SEQUENCE
	:	'\\u'
		// The four look-ahead characters are the hex digits of a UTF-16 code unit.
		{ Char.GetUnicodeCategory((char) ConvertHexCharArrayIntoInt32(new char[]{
			(char) input.LT(1), (char) input.LT(2), (char) input.LT(3), (char) input.LT(4)}))
			== $UnicodeClassScope::allowedClass }?
		HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
	|	'\\U'
		// The eight look-ahead characters are converted by the
		// TransformUtf32ToUtf16 helper, and the category is taken at index 0
		// of its result.
		{ Char.GetUnicodeCategory(TransformUtf32ToUtf16(ConvertHexCharArrayIntoInt32(new char[]{
			(char) input.LT(1), (char) input.LT(2), (char) input.LT(3), (char) input.LT(4),
			(char) input.LT(5), (char) input.LT(6), (char) input.LT(7), (char) input.LT(8)})), 0)
			== $UnicodeClassScope::allowedClass }?
		HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
	;


More information about the antlr-interest mailing list