[antlr-interest] Embedded Languages

Tue Feb 4 07:06:00 PST 2003

Hi,

I need to parse an embedded language. I basically wish to reproduce 
the input to stdout except for delimited sections of the input stream 
which must be parsed, much like the multiplex javadoc sample.

I have tried to combine the SED sample and the javadoc sample to get 
what I need, but I am struggling to get it working.

I have pasted in the four files I have used - modified from the 
javadoc sample.

I am pretty sure that if I just want to reproduce input except for 
certain delimited sections of the text, I should be able to use 
one lexer and one parser.

I don't actually need to parse the complete input text, just the 
delimited sections.

There must be an easier way to do this. I have attached my attempt in 
case it is of some use.

START OF CODELEX.G
-------------------------------------------------------------------
// CODELEX.G -- outer lexer. It echoes ordinary input to stdout through the
// IGNORE filter rule and hands control to another token stream when it
// sees the opening delimiter (see XGENCODE_OPEN below).
header {
#include "antlr/TokenStreamSelector.hpp"
#include <iostream>
}

options {

	language="Cpp";
}

class CodeLexer extends Lexer;
options {
	k=2;                    // two characters of lookahead
	filter=IGNORE;          // input that matches no token rule goes to IGNORE
        importVocab = Common;
	// NOTE(review): XGenLexer in XGENLEX.G also sets exportVocab = XGEN.
	// Confirm that both lexers are meant to export the same vocabulary.
	exportVocab = XGEN;
}

{
private:
	// Selector used by XGENCODE_OPEN to switch token streams. It is not
	// initialised here, so setSelector() must be called before lexing.
	ANTLR_USE_NAMESPACE(antlr)TokenStreamSelector* selector;
public:
	// Stores the selector pointer; this class does not take ownership.
	void setSelector(ANTLR_USE_NAMESPACE(antlr)
TokenStreamSelector* selector_) {
		selector=selector_;
	}
}

// Filter rule (named by filter=IGNORE in the class options): copies input
// that is not part of a token to stdout.
//  - "\r\n" bumps the line counter via newline() and is written as std::endl.
//  - Any other single character is written unchanged.
// The '\r' and '\n' alternatives are commented out, so a lone CR or LF is
// echoed by the second alternative and does NOT call newline().
protected
IGNORE
  :  ( "\r\n" /* | '\r' | '\n' */ )
     {newline(); std::cout << std::endl;}
  |  c:. {std::cout << c;}
  ;

// Opening delimiter of an embedded section. After matching "[" the action
// pushes the stream registered under the name "doclexer" onto the selector.
// NOTE(review): the driver that registers streams is not shown here;
// confirm that "doclexer" is the name the XGenLexer is registered under.
XGENCODE_OPEN
	:	"[" {selector->push("doclexer");}
	;
-------------------------------------------------------------------
END OF CODELEX.G

START OF CODEPARSE.G
-------------------------------------------------------------------
// CODEPARSE.G -- outer parser. It recognises only the delimited sections
// and delegates their contents to XGenParser.
options {
	language="Cpp";
}

// Class preamble: the xgencode rule below constructs an XGenParser.
{
#include "XGenParser.hpp"
}

class CodeParser extends Parser;
options {
	importVocab=XGEN;       // token types come from the XGEN vocabulary
}

// Start rule: zero or more delimited xgen sections.
input
	:	(xgencode)*

	;

// One delimited section: XGENCODE_OPEN, the section body, XGENCODE_CLOSE.
xgencode
	:	XGENCODE_OPEN
		{
		// Parse the section body with a nested XGenParser built from
		// this parser's input state, then run its content rule.
		XGenParser xgen(getInputState());
		xgen.content();
		}
		XGENCODE_CLOSE
	;
-------------------------------------------------------------------
END OF CODEPARSE.G

START OF XGENLEX.G
-------------------------------------------------------------------
// XGENLEX.G -- lexer for the text between the "[" and "]" delimiters.
header {
#include "antlr/TokenStreamSelector.hpp"
}

options {
	language="Cpp";
}

class XGenLexer extends Lexer;
options {
	k=2;                    // two characters of lookahead
	importVocab = Common;
	// NOTE(review): CodeLexer in CODELEX.G also sets exportVocab = XGEN.
	// Confirm that both lexers are meant to export the same vocabulary.
	exportVocab = XGEN;
	filter=true;            // characters that match no rule are skipped
}

{
private:
	// Selector used by XGENCODE_CLOSE to return to the previous stream.
	// It is not initialised here, so setSelector() must be called before lexing.
	ANTLR_USE_NAMESPACE(antlr)TokenStreamSelector* selector;
public:
	// Stores the selector pointer; this class does not take ownership.
	void setSelector(ANTLR_USE_NAMESPACE(antlr)
TokenStreamSelector* selector_) {
		selector=selector_;
	}
}

// The keyword "foreach", exactly one space, then an identifier. ID is a
// protected rule, so the identifier is part of this token's text.
FOREACH
	:	"foreach" ' ' ID
	;

// Helper rule: one or more ASCII letters. Being protected, it is only
// matched when another rule (FOREACH) refers to it.
protected
ID	:	('a'..'z'|'A'..'Z')+
	;

/** Matches a '[' seen while inside an xgen section and discards it
 *  by setting the token type to SKIP.
 *  NOTE(review): this rule looks adapted from the javadoc sample.
 *  XGENCODE_CLOSE starts with ']', so a '[' can never start it, and
 *  with filter=true an unmatched '[' would presumably be skipped
 *  anyway -- confirm whether this rule is still needed.
 */
OPEN:	'[' {$setType(ANTLR_USE_NAMESPACE(antlr)Token::SKIP);}
	;

// Closing delimiter of an embedded section. After matching "]" the action
// pops the selector, returning to the stream that was active before the push.
XGENCODE_CLOSE
	:	"]" {selector->pop();}
	;

/** Line terminators inside an xgen section: bump the line counter
 *  with newline() and drop the token by setting its type to SKIP.
 */
NEWLINE
	:	(	"\r\n"  // Evil DOS
		|	'\r'    // Macintosh
		|	'\n'    // Unix (the right way)
		)
		{ newline(); $setType(ANTLR_USE_NAMESPACE(antlr)
Token::SKIP); }
	;
-------------------------------------------------------------------
END OF XGENLEX.G

START OF XGENPARSE.G
-------------------------------------------------------------------
// XGENPARSE.G -- parser for the contents of a delimited xgen section.
options {
	language="Cpp";
}

// Class preamble: the content rule writes to std::cout.
{
#include <iostream>
}

class XGenParser extends Parser;
options {
	importVocab=XGEN;       // token types come from the XGEN vocabulary
}

// Body of an xgen section: zero or more FOREACH tokens. For each one the
// action prints "found: " followed by the token text to stdout.
//
// Fix: the trailing "//" comment had been wrapped across two lines, which
// left the word PARAM outside the comment as a token reference. The rule
// then required "FOREACH PARAM", and no lexer rule shown defines PARAM.
// The comment is now on one line and names the right token.
content
	:	(	p:FOREACH	// FOREACH token text includes the ID
			{std::cout << "found: " << p->getText() << std::endl;}
		)*
	;

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/