[antlr-interest] Memory issue while lexer initialization

Federico Di Gregorio fog at initd.org
Fri Dec 28 10:37:13 PST 2007


Il giorno ven, 28/12/2007 alle 10.10 -0800, Terence Parr ha scritto:
> hi. you must have one huge grammar.  Do you have a lexer with a  
> million tokens?

About lexer size, I have a pretty simple (I hope) set of rules that
generates a pretty huge transition table: about 70000 lines of 16
shorts each, for a total of about 2100 Kb of static data. I understand
2100 Kb is not that much, but any suggestion on how to reduce the lexer
size (my code now takes 3x as long to compile) is welcome.

The grammar is attached to this email.

federico

-- 
Federico Di Gregorio                         http://people.initd.org/fog
Debian GNU/Linux Developer                                fog at debian.org
INIT.D Developer                                           fog at initd.org
                   I came like Water, and like Wind I go. -- Omar Khayam
-------------- next part --------------
grammar llTurtle;

// Code-generation options: C# target, tree-building parser, CommonTree nodes.
options {
	language=CSharp;
	output=AST;
	ASTLabelType=CommonTree;
}

// Token types that have no lexer rule in this grammar; they appear only as
// node types in the tree rewrites (->) of the parser rules.
tokens {
	BLANK;
	LITERAL;
	COLLECTION;
	STATEMENTS;
	PREDICATEOBJECTS;
}

// Generated parser and lexer classes share one C# namespace.
@parser::namespace { Greengrass.Parser.Turtle }
@lexer::namespace  { Greengrass.Parser.Turtle }

// C# text copied into the header of the generated source.
@header {
	using System.Collections;
	using System.Collections.Generic;

	// Disable CS0219 (assigned but never used) and CS0162 (unreachable code)
	// to avoid spurious warnings from Antlr generated code.
	#pragma warning disable 219, 162
}

@members {
	// Turns every recognition error into a hard failure: the message built
	// by the base recognizer is wrapped in a ParserException together with
	// the line and column of the error. This override never returns normally.
	public override string GetErrorMessage(RecognitionException e, string[] tokenNames)
	{
		throw new ParserException(
			base.GetErrorMessage(e, tokenNames),
			e.Line,
			e.CharPositionInLine,
			e);
	}

	// Error messages show an offending token through its own ToString().
	public override string GetTokenErrorDisplay(IToken t)
	{
		return t.ToString();
	}
}

/* PARSER */

// Start rule: any number of statements, then end of input.
document
	: ( statement )* EOF
	;

// A directive or a group of triples. The closing '.' is mandatory but is
// dropped from the tree by the ! operator.
statement
	: ( directive | triples ) '.'!
	;

// Either of the two supported directives.
directive
	: ( prefixDirective | baseDirective )
	;
	
// Prefix declaration; the PREFIX keyword token becomes the subtree root with
// the prefix name and the URI reference as its two children.
prefixDirective
	: kw=PREFIX ns=PNAME uri=URIREF -> ^($kw $ns $uri)
	;

// Base declaration; the BASE keyword token is the root, the URI its child.
baseDirective
	: kw=BASE uri=URIREF -> ^($kw $uri)
	;
 	
// A subject followed by its predicate/object groups, gathered under a
// STATEMENTS node.
triples
	: s=subject po=predicateObjectList -> ^(STATEMENTS $s $po)
	;

// One or more "verb objectList" pairs separated by ';', with an optional
// trailing ';'. The rewrite emits one PREDICATEOBJECTS subtree per pair; the
// ';' separators do not appear in the tree.
predicateObjectList
	: verb objectList (';' verb objectList)* ';'? -> ^(PREDICATEOBJECTS verb objectList)+
	;

// Comma-separated objects; only the objects themselves reach the tree.
objectList
	: obj ( ','! obj )*
	;

// Predicate position of a triple.
//  - The keyword 'a' is replaced in the tree by a URIREF node holding the
//    rdf:type URI.
//  - Anything else is delegated to the predicate rule, so that variables
//    (VARDOLLAR / VARQMARK) are accepted here just as they are in subject
//    and object position.
verb
	: A -> { adaptor.Create(URIREF, "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>") }
	| predicate
	;

// Subject position: a resource, a blank node, or a variable of either form.
subject
	: resource
	| blank
	| ( VARDOLLAR | VARQMARK )
	;

// A predicate: a resource or a variable of either form.
predicate
	: resource
	| ( VARDOLLAR | VARQMARK )
	;
	
// Object position: a resource, a blank node, a literal, or a variable of
// either form.
obj
	: resource
	| blank
	| literal
	| ( VARDOLLAR | VARQMARK )
	;

// Literal values. Every alternative produces a LITERAL subtree.
//  - Quoted strings keep their optional language tag or datatype URI as
//    extra children.
//  - Bare numeric and boolean tokens get a synthesized URIREF child naming
//    the XML Schema datatype; integers use xsd:integer, the datatype Turtle
//    assigns to integer literals.
literal
	: s=quotedString (lang | datatype)? -> ^(LITERAL $s lang? datatype?)
	| INTEGER -> ^(LITERAL INTEGER { adaptor.Create(URIREF, "<http://www.w3.org/2001/XMLSchema#integer>") } )
	| DOUBLE  -> ^(LITERAL DOUBLE  { adaptor.Create(URIREF, "<http://www.w3.org/2001/XMLSchema#double>") } )
	| DECIMAL -> ^(LITERAL DECIMAL { adaptor.Create(URIREF, "<http://www.w3.org/2001/XMLSchema#decimal>") } )
	| BOOLEAN -> ^(LITERAL BOOLEAN { adaptor.Create(URIREF, "<http://www.w3.org/2001/XMLSchema#boolean>") } )
	;

// Language tag of a string literal; the leading '@' is left out of the tree.
lang
	: '@'! LANG
	;

// Datatype of a string literal; the '^^' marker is left out of the tree.
datatype
	: '^^'! URIREF
	;

// Blank nodes in their four forms:
//  - a labelled node id, wrapped in a BLANK subtree;
//  - the single token '[]', giving a bare BLANK node;
//  - a bracketed predicate/object list, giving a STATEMENTS subtree whose
//    subject is a BLANK node;
//  - a collection.
blank
	: id=NODEID -> ^(BLANK $id)
	| '[]' -> BLANK
	| '[' po=predicateObjectList ']' -> ^(STATEMENTS BLANK $po)
	| collection
	;

// Parenthesised, possibly empty, sequence of objects under a COLLECTION node.
collection
	: '(' ( obj )* ')' -> ^(COLLECTION ( obj )*)
	;
	
// A resource is a URI reference or a qualified name.
resource
	: URIREF
	| QNAME
	// FIXME: unfortunately a PNAME is a perfectly valid QNAME and we need this hack.
	// A PNAME token seen here is re-issued as a QNAME node with the same text.
	| p=PNAME -> { adaptor.Create(QNAME, $p.Text) }
	;

// Either string form: single-quoted-pair STRING or triple-quoted LONGSTRING.
quotedString
	: ( STRING | LONGSTRING )
	;

/* LEXER */

// The two boolean keywords.
BOOLEAN
	: ( 'true' | 'false' )
	;

// Optionally signed run of digits.
INTEGER
	: ( '+' | '-' )? ( DIGIT )+
	;
	
// Optionally signed number that always carries an exponent. The mantissa is
// either digits with an optional fractional part ("1", "1.", "1.5") or a
// leading-dot fraction (".5").
DOUBLE
	: ( '+' | '-' )? ( DIGIT+ ( '.' DIGIT* )? | '.' DIGIT+ ) EXPONENT
	;

// Optionally signed number with a decimal point and no exponent. Digits may
// be missing on one side of the point but not both.
DECIMAL
	: ( '+' | '-' )? ( DIGIT+ '.' DIGIT* | '.' DIGIT+ )
	;

// The keyword 'a'; the verb rule rewrites it to the rdf:type URI.
A	: 'a' ;

// Labelled blank node: the prefix '_:' followed by a name that starts with a
// PN_NAME_START character and continues with PN_NAME characters.
// NOTE(review): the '!' after '_:' sits inside a lexer rule; confirm that the
// ANTLR version in use honours it there, otherwise the token text will still
// include the '_:' prefix.
NODEID 
	: '_:'! PN_NAME_START PN_NAME*
	;

// Prefix name: an optional name (which may not start with '_') followed by
// a colon. A lone ':' is the empty prefix.
PNAME
	: ( PN_NAME_START ( PN_NAME )* )? ':'
	;

// Qualified name: optional prefix part (which may start with '_'), a colon,
// then an optional local part made of PN_NAME characters.
QNAME
	: ( PN_NAME_START_UNDERSCORE ( PN_NAME )* )? ':' ( PN_NAME )*
	;

// Keyword introducing a base directive.
BASE	: '@base' ;
	
// Keyword introducing a prefix directive.
PREFIX	: '@prefix' ;

// URI reference delimited by '<' and '>'. The body accepts any character
// outside the CONTROL set, a bare double quote, and the listed backslash
// sequences (including an escaped '>').
// The '\\u' and '\\U' alternatives match only the two-character prefix; the
// characters after it are consumed by the ~(CONTROL) alternative and are not
// checked to be hex digits.
URIREF
	: '<' ( ~(CONTROL) | '"' | '\\>' | '\\t' | '\\n' | '\\r' | '\\\\' | '\\u' | '\\U' )* '>'
	;

// Single-line string delimited by double quotes. The body accepts any
// character outside the CONTROL set, a bare '>', and the listed backslash
// sequences (including an escaped double quote).
// As in URIREF, '\\u' and '\\U' match only the escape prefix; what follows is
// not validated.
STRING
	: '"' ( ~(CONTROL) | '>' | '\\t' | '\\n' | '\\r' | '\\\\' | '\\"' | '\\u' | '\\U' )* '"'
	;

// Triple-quoted string. In addition to everything STRING accepts, the body
// allows literal tab, newline and carriage-return characters, and a double
// quote when it is immediately followed by a non-quote character (that
// alternative consumes both characters).
// NOTE(review): two adjacent double quotes inside the body do not appear to
// match any alternative; confirm whether that input needs to be supported.
LONGSTRING
	: '"""' ( ~(CONTROL) | '"' ~'"' | '>' | '\\t' | '\\n' | '\\r' | '\\\\' | '\\"' | '\\u' | '\\U' | '\t' | '\n' | '\r' )* '"""'
	;

// Language tag body: lowercase letters, then any number of '-' separated
// groups of lowercase letters and digits.
LANG
	: ( 'a'..'z' )+ ( '-' ( '0'..'9' | 'a'..'z' )+ )*
	;

// Variable written with a leading '$' and at least one PN_NAME character.
VARDOLLAR
	: '$' ( PN_NAME )+
	;
	
// Variable written with a leading '?' and at least one PN_NAME character.
VARQMARK
	: '?' ( PN_NAME )+
	;

// Runs of spaces, tabs, form feeds and line ends, sent to the hidden channel
// so that the parser does not see them.
WS
	: ( EOL | ' ' | '\t' | '\f' )+
	  { $channel = HIDDEN; }
	;

// Comment: from '#' to the end of the line, sent to the hidden channel.
// The line end is optional so that a comment on the very last line of the
// input, with no trailing newline, is still recognised.
COMMENT
	: '#' (~('\n' | '\r'))* (EOL)? { $channel = HIDDEN; }
	;

// One line terminator: CR LF, a lone CR, or a lone LF.
fragment EOL
	: ( '\r' ( '\n' )? | '\n' )
	;
	
// Characters allowed at the start of a name: ASCII letters plus the listed
// Unicode ranges, one range per line.
fragment PN_NAME_START
	: 'A'..'Z'
	| 'a'..'z'
	| '\u00c0'..'\u00d6'
	| '\u00d8'..'\u00f6'
	| '\u00f8'..'\u02ff'
	| '\u0370'..'\u037d'
	| '\u037f'..'\u1fff'
	| '\u200c'..'\u200d'
	| '\u2070'..'\u218f'
	| '\u2c00'..'\u2fef'
	| '\u3001'..'\ud7ff'
	| '\uf900'..'\ufdcf'
	| '\ufdf0'..'\ufffd'
	;

// Name start characters plus the underscore.
fragment PN_NAME_START_UNDERSCORE
	: '_' | PN_NAME_START
	;

// Characters allowed after the first character of a name: every name start
// character (underscore included), '-', digits, the middle dot U+00B7, the
// combining marks U+0300..U+036F and the tie characters U+203F..U+2040.
// Unicode escapes are written with a lowercase \u followed by four hex
// digits, the only Unicode escape form ANTLR 3 character literals accept.
fragment PN_NAME
	: PN_NAME_START_UNDERSCORE
	| '-'
	| '0'..'9'
	| '\u00b7'
	| '\u0300'..'\u036f'
	| '\u203f'..'\u2040'
	;

// Characters excluded from the bodies of URIREF, STRING and LONGSTRING via
// ~(CONTROL): U+0000..U+001F, the double quote, the backslash and '>'.
fragment CONTROL
	: '"'
	| '>'
	| '\\'
	| '\u0000'..'\u001f'
	;

// A single ASCII decimal digit.
fragment DIGIT : '0'..'9' ;

// Exponent suffix of a DOUBLE: 'e' or 'E', an optional sign, then digits.
fragment EXPONENT
	: ( 'E' | 'e' ) ( '-' | '+' )? ( DIGIT )+
	;


More information about the antlr-interest mailing list