[antlr-interest] more newbie help required

karl wettin kalle at snigel.net
Tue Feb 7 16:49:42 PST 2006


Hi again,

thanks for the inital help. I'm really happy to see that the list is  
active. It was quite impossible to figure out or get help with JavaCC.

(I presume I want to use a tree later on, but for a starter I'll just  
try to get my grammar to work.)

Here is my current problems:

1. I can't figure out why I can't get the matched value of  
expressions within one expression. It's all null. It obviously match.

2. The NUM expression does not work. Try to uncomment the meta:(/*NUM  
|*/ ALPHANUMWORD | LETTER | DIGIT)

3. I have plenty of nondeterminism warnings I presume are related  
with the above, but why they are nondeterminal I have no idea.

Any other comments on the grammar is of course also welcome.


Output from my Main:

null
["s1e9-10",<6>,line=1,col=1]
null
null
null
null
null
["evilpirates",<18>,line=1,col=1]


package se.snigel.tpb.analysis.tvshow;
import java.io.StringReader;
public class Main {	
	public static final void main (String[] args) throws Exception {
		String val = "some.show.s1e9-10.xvid.ac3.5.1.hdtv-evilpirates";
		new TVShowParser(new TVShowLexer(new StringReader(val))).expr();
	}
}



header
{
   package se.snigel.tpb.analysis.tvshow;
   import java.util.*;
}
class TVShowParser extends Parser;

expr :
	WELL_FORMATTED
	| BAD_FORMATTED
     ;
	
class TVShowLexer extends Lexer;

options {
     k=3; // presumed needed for SEASON_AND_EPISODE
}

WELL_FORMATTED : 	
	show:(		
		options { greedy=false; } :
		ALPHANUMWORD ('.' | ('-' ALPHANUMWORD '.')) 		
	)+
	{System.out.println(show);}
	
	sae:SEASON_AND_EPISODE
	{System.out.println(sae);}
	
	((
		options { greedy=false; } :		
		'.'
		meta:(/*NUM |*/ ALPHANUMWORD | LETTER | DIGIT)
		{System.out.println(meta);}
	)+)?
	
	'-'
	group:RELEASE_GROUP
	{System.out.println(group);}
	
	;

BAD_FORMATTED :
	(	
		options { greedy=false; } :
		ALPHANUMWORD ((' ' | PUNCT)+)?
	)+
	SEASON_AND_EPISODE
	;

SEASON_AND_EPISODE :		
	STANDARD
	| COMPOSITE
	| SXE
	;
	
STANDARD :
	STANDARD_SEASON (('.' | ' ')+)? STANDARD_EPISODE	
	(	
		STANDARD_VECTOR
		| STANDARD_SEQUENCE
	)?
	;
	
STANDARD_SEASON :
	("season" | "sŠsong" | 's')
	(('.' | ' ')+)?
	SEASON_NUMBER
	;

STANDARD_EPISODE :
	("episode" | ('e'('p'('s')?)?) | "avsnitt")
	(('.' | ' ')+)?
	EPISODE_NUMBER
	;
	
STANDARD_SEQUENCE :
	'-'
	(STANDARD_SEASON)?
	(STANDARD_EPISODE | EPISODE_NUMBER)
	;

STANDARD_VECTOR : 	
	(
		(' ' | '+' | '.' | ',')
		(STANDARD_SEASON)?
		STANDARD_EPISODE
	)+
	;
		
SXE :
	SEASON_NUMBER 'x' EPISODE_NUMBER
	;	

/** 1 - 49 */
SEASON_NUMBER: 	
	('0')? '1'..'9'
	| '1'..'4' '0'..'9'
	;

/** 1 - 39 */
EPISODE_NUMBER:
	('0')? '1'..'9'
	| '1'..'3' '0'..'9'
	;

/** 109 */
COMPOSITE :
	COMPOSITE_SEASON COMPOSITE_EPISODE
	;
	
	
/** 1 - 9 */
COMPOSITE_SEASON :
	'1'..'9'
	;

/** 1 - 39 */
COMPOSITE_EPISODE :
	'0' '1'..'9'
	| '1'..'3' '0'..'9'
	;

RELEASE_GROUP :	
	(ALPHANUMWORD | ACRONYM)
	;

/** unicode roman letters */
LETTER:					
     '\u0041'..'\u005a'
     | '\u0061'..'\u007a'
     | '\u00c0'..'\u00d6'
     | '\u00d8'..'\u00f6'
     | '\u00f8'..'\u00ff'
     | '\u0100'..'\u1fff'
	;
	
/** unicode korean */
KOREAN :
	'\uac00' .. '\ud7af'
	;
	
/** chinese and japanese */
CJ :
	'\u3040'..'\u318f'
     | '\u3300'..'\u337f'
     | '\u3400'..'\u3d2d'
     | '\u4e00'..'\u9fff'
     | '\uf900'..'\ufaff'
	;


/** basic word: a sequence of digits & letters */
ALPHANUMWORD :
	(LETTER | DIGIT | KOREAN)+
     ;

/** I.B.M., C.S.I. et.c. becomes IBM, CSI.. */
ACRONYM :
	LETTER '.'! (LETTER '.'!)+ 	
	;

/** word with at least one digit */
HAS_DIGIT :
     (LETTER | DIGIT)*
     DIGIT
     (LETTER | DIGIT)*
	;
	
PUNCT :
	'_'
	|'-'
	|'/'
	|'.'
	|','
	;	

/** floating point, serial, model numbers, ip addresses, et.c.
  	every other segment must have at least one digit */
NUM :	
	ALPHANUMWORD PUNCT HAS_DIGIT
	| HAS_DIGIT PUNCT ALPHANUMWORD
	| ALPHANUMWORD (PUNCT HAS_DIGIT PUNCT ALPHANUMWORD)+
	| HAS_DIGIT (PUNCT ALPHANUMWORD PUNCT HAS_DIGIT)+
	| ALPHANUMWORD PUNCT HAS_DIGIT (PUNCT ALPHANUMWORD PUNCT HAS_DIGIT)+
	| HAS_DIGIT PUNCT ALPHANUMWORD (PUNCT HAS_DIGIT PUNCT ALPHANUMWORD)+
	;
	
/** unicode digits */
DIGIT :
     '\u0030'..'\u0039'
     | '\u0660'..'\u0669'
     | '\u06f0'..'\u06f9'
     | '\u0966'..'\u096f'
     | '\u09e6'..'\u09ef'
     | '\u0a66'..'\u0a6f'
     | '\u0ae6'..'\u0aef'
     | '\u0b66'..'\u0b6f'
     | '\u0be7'..'\u0bef'
     | '\u0c66'..'\u0c6f'
     | '\u0ce6'..'\u0cef'
     | '\u0d66'..'\u0d6f'
     | '\u0e50'..'\u0e59'
     | '\u0ed0'..'\u0ed9'
     | '\u1040'..'\u1049'
	;
	
	





More information about the antlr-interest mailing list