[antlr-interest] Trouble getting fuzzy lexer to recognize ID followed by asterisk

Rick Mann rmann at latencyzero.com
Wed Mar 7 20:06:20 PST 2007


I've got a fuzzy lexer to extract a few interesting symbols from .h  
and .d files (C headers and D modules, both similar in syntax). It  
does okay getting a function declaration like:

extern Boolean
InvokeNavPreviewUPP(
   NavCBRecPtr    callBackParms,
   void*          callBackUD,
   NavPreviewUPP  userUPP)  SOME_MACRO;

But fails if one of the argument types has whitespace before the  
asterisk. No matter what I do, I can't seem to get it to work. The  
rule that matches on type looks like this:

fragment
TYPE:   QID ('[]'| '*'+)?
     ;


I want to change it to:

fragment
TYPE:   QID ('[]'| (WS? '*')+)?
     ;

But then it can't even match the void* (with no space) or even void  
without an asterisk. The entire grammar is reproduced below. I'd sure  
appreciate some help. Thanks!

--
Rick





lexer grammar FuzzyD;
options
{
	filter=true;
}
	
MODULE
	:	'module' WS name=QIDStar WS? ';'
		 {System.out.println("Module:\t"+$name.text);}
	;
	
IMPORT
	:	'import' WS name=QIDStar WS? ';'
	;
	
/** Avoids having "return foo;" match as a field */
RETURN
	:	'return' (options {greedy=false;}:.)* ';'
	;

CLASS
	:	'class' WS name=ID WS?
		(':' WS QID WS? (',' WS? QID WS?)*)? '{'
         {System.out.println("Class:\t"+$name.text);}
	;
	
STRUCT
	:	'struct' WS name=ID WS?
		'{'
         {System.out.println("Struct:\t"+$name.text);}
	;
	
TYPEDEF
	:	('typedef'|'alias') WS baseName=TYPE WS name=ID WS? ';'
         {System.out.println("Typedef:\t" + $baseName.text + "\t" +  
$name.text);}
	;
	
METHDECL
     :   TYPE WS name=ID WS? '(' WS? ( ARG WS? (',' WS? COMMENT? WS?  
ARG WS?)* )? ')' WS?
        ('throws' WS QID WS? (',' WS? QID WS?)*)? WS? ID? ';'
         {System.out.println("MethodDecl:\t"+$name.text);}
     ;

METHOD
     :   TYPE WS name=ID WS? '(' WS? ( ARG WS? (',' WS? ARG WS?)* )?  
')' WS?
        ('throws' WS QID WS? (',' WS? QID WS?)*)? '{'
         {System.out.println("Method:\t"+$name.text);}
     ;

FIELD
     :   TYPE WS name=ID '[]'? WS? (';'|'=')
//        {System.out.println("Field:\t" + $name.text);}
     ;

STAT:	('if'|'while'|'switch'|'for') WS? '(' ;
	
CALL
     :   name=QID WS? '('
//       {/*ignore if this/super */ System.out.println("found call "+ 
$name.text);}
     ;

COMMENT
     :   '/*' (options {greedy=false;} : . )* '*/'
//        {System.out.println("found comment "+getText());}
     ;

SL_COMMENT
     :   '//' (options {greedy=false;} : . )* '\n'
//        {System.out.println("found // comment "+getText());}
     ;
	
STRING
	:	'"' (options {greedy=false;}: ESC | .)* '"'
	;

CHAR
	:	'\'' (options {greedy=false;}: ESC | .)* '\''
	;

WS  :   (' '|'\t'|'\n'|'\r')+
     ;

fragment
QID :	ID ('.' ID)*
	;
	
/** QID cannot see beyond end of token so using QID '.*'? somewhere  
won't
*  ever match since k=1 lookahead in the QID loop of '.' will make it  
loop.
*  I made this rule to compensate.
*/
fragment
QIDStar
	:	ID ('.' ID)* '.*'?
	;

fragment
TYPE:   QID ('[]'| '*'+)?
     ;

fragment
ARG :   (('in' | 'out' | 'inout' | 'const') WS)? TYPE WS ID
//		{ System.out.println("Found argument " + $ID.text + " of type " +  
$TYPE.text); }
     ;

fragment
ID  :   ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')*
     ;

fragment
ESC	:	'\\' ('"'|'\''|'\\')
	;



More information about the antlr-interest mailing list