[antlr-interest] my nondeterministic grammar

karl wettin kalle at snigel.net
Tue Mar 28 11:00:30 PST 2006


Basically I'm trying to use Antlr as a natrual language parser. I've  
succeded with the grammar before, but as I felt the urge to refactor  
it I I get all these nondeterministic warnings. I understand what  
they mean, but I can understand why I get them.
If there is one rule I don't understand why it is nondeterministic,  
it's the protected EN_NATURAL_EPISODE. And since this rule "fails" my  
grammar is rendered useless. I've been reading up on syntactic  
predicate, but don't think that it's the solution.

There are no trees or any other fancy things. It's a straight forward  
Lexer. It's Java and I have added a main method that explains what I  
try to parse.

Also, I don't understand why my EN_NATURAL_NUMBER returns a token and  
not an integer?

I would be very greatful if someone could take a few minutes and look  
it. I'll even out the karma from freeloading by answering newbie  
questions on other forums until I've figured this out. :)


header {
	package se.snigel.tindex.analysis.tvserie;
	import java.util.*;
	import java.io.*;
	
	import antlr.CommonToken;
	import antlr.Token;	
}


class TVSerieEpisodeLexer extends Lexer;
options {
	k=10;
}
{	
	public class SeasonAndEpisode {
		private int season;
		private int episode;	
		private SeasonAndEpisode(int season, int episode) {
			this.season = season;
			this.episode = episode;
		}
		public int getSeason() {
			return season;
		}
		public int getEpisode() {
			return episode;
		}
		public String toString() {
			return season + "x" + episode;	
		}
	}
	
	private LinkedList<SeasonAndEpisode> episodes = new  
LinkedList<SeasonAndEpisode>();
	private LinkedList<Integer> seasons = new LinkedList<Integer>();
	
	/** for sanity checks */
	private Long fileSize;
	
	private void addSequence(boolean skipFirstEpisode, Integer  
fromSeason, Integer fromEpisode, Integer toSeason, Integer toEpisode) {
         int startEpisode = fromEpisode;
         int endEpisode;
         if (fromSeason == toSeason) {
             endEpisode = toEpisode;
         } else {
             throw new RuntimeException("Need to figure out the last  
episode of this season.");
         }

         LinkedList<SeasonAndEpisode> saes = new  
LinkedList<SeasonAndEpisode>();

         for (int season = fromSeason; season <= toSeason; season++) {
             for (int episode = startEpisode; episode <= endEpisode;  
episode++) {
                 saes.add(new SeasonAndEpisode(season, episode));
             }
             startEpisode = 1;
         }

         if (skipFirstEpisode) {
			saes.removeFirst();
         }

         // how big is the file, and how many episodes are there?
         if (fileSize != null) {
             if (120 * 1024 * saes.size() <= fileSize) {
                 episodes.addAll(saes);
             } else {
                 // todo could it be S1 - 7 as in s1e7?
             }
         }

     }

     public String toString() {
     		StringBuffer buf = new StringBuffer();
     		for (SeasonAndEpisode sae : episodes) {
     			buf.append(sae);
			buf.append('\n');	
     		}
     		for (Integer s : seasons) {
     			buf.append(s);
			buf.append('\n');	
     		}
     		
     		return buf.toString();
     }

	public static void main(String[] args) throws Exception {
	
		String[] testText = new String[]{
			"season 1 jo men visst",
			"s1e1",
			"s1",
			"s 1",
			"season 1",
			"season 1 episode 3",
			"s1,s2,s3",
			"s1-s3",
			"s1e3-4",
			"s1 e4, 5, 6",
			"season one will fail. how do I handle the EN_NATURAL_NUMBER int?",
			"s1 - 4 is this really four seasons? sanity check. is it s1e4?",
			"s1e19-s2e12 what is the last episode of season 1?",
		};
			
		for (String text : testText) {
			System.out.println(text);
			TVSerieEpisodeLexer lexer = new TVSerieEpisodeLexer(new  
StringReader(text));
			lexer.nextToken();
			System.out.println(lexer);
		}
	}
	
}


	
protected WHITESPACE :
	('.' | ' ')+
	;

protected EN_NATURAL_SEASON :
	's' ("eason" ('s')?)?
	;

protected EN_NATURAL_EPISODE :
	'e'('p'("isode" | 's')?)?
	/*"episode" | "eps" | "ep" | "e"*/
	;

protected EN_NATURAL_VECTOR :
	"and" | ","
	;

protected EN_NATURAL_SEQUENCE :
	"to" | "through" | "-"
	;

protected EN_NATURAL_NUMBER returns [int v] :
     ('0'..'9')+ { v=Integer.valueOf($getText); }
     | "one"     { v=1; }
     | "two"     { v=2; }
     | "three"   { v=3; }
     | "four"    { v=4; }
     | "five"    { v=5; }
     | "six"     { v=6; }
     | "seven"   { v=7; }
     | "eight"   { v=8; }
     | "nine"    { v=9; }
     | "ten"     { v=10;}
     ;

	
EN_NATURAL :
	{System.out.println("EN_NATURAL");}
	EN_NATURAL_SEASON (WHITESPACE)?
	startSeason:EN_NATURAL_NUMBER
		
	(	
		// specific episodes
		EN_NATURAL_EPISODE (WHITESPACE)?
		startEpisode:EN_NATURAL_NUMBER	
		(
			{System.out.println("episode sequence");}
			// sequence
			EN_NATURAL_SEQUENCE (WHITESPACE)?
			(
		        EN_NATURAL_SEASON (WHITESPACE)?
		        episodeSequenceEndSeason:EN_NATURAL_NUMBER (WHITESPACE)?	
		    )?
		    EN_NATURAL_EPISODE (WHITESPACE)?
		    episodeSequenceEndEpisode:EN_NATURAL_NUMBER
		    {
		    		addSequence(false,
					new Integer(startSeason.getText()),
					new Integer(startEpisode.getText()),
					episodeSequenceEndSeason == null ? new Integer 
(startSeason.getText()) : new Integer(episodeSequenceEndSeason.getText 
()),
					new Integer(episodeSequenceEndEpisode.getText())
				);
		    }
			
		
	    ) | (
	    		{System.out.println("episode vector");}	    		
	    		// vector
			{
	    			Integer lastSeenSeason = new Integer(startSeason.getText());
	    			episodes.add(new SeasonAndEpisode(new Integer 
(startSeason.getText()), new Integer(startEpisode.getText())));				
	    		}
	    		(	    			
				EN_NATURAL_VECTOR (WHITESPACE)?
				(
					EN_NATURAL_SEASON (WHITESPACE)?
					episodeVectorAndSeason:EN_NATURAL_NUMBER (WHITESPACE)?
					{
						lastSeenSeason = new Integer(episodeVectorAndSeason.getText());	
					}
				)?
				EN_NATURAL_EPISODE (WHITESPACE)?

				episodeVectorAndEpisode:EN_NATURAL_NUMBER (WHITESPACE)?
				{
	            		episodes.add(new SeasonAndEpisode(lastSeenSeason, new  
Integer(episodeVectorAndEpisode.getText())));
	            }
			)+
			
	    		
		) | (
			(.)*
			{System.out.println("a specific episode in a season.");}
			{
				episodes.add(new SeasonAndEpisode(new Integer(startSeason.getText 
()), new Integer(startEpisode.getText())));
			}
			
			
		)  		
	) | (
		{System.out.println("season only.");}
		(	
			{System.out.println("sequence of seasons.");}
			// sequence
			EN_NATURAL_SEQUENCE (WHITESPACE)?
			(EN_NATURAL_SEASON (WHITESPACE)?)?
		    seasonSequenceEndSeason:EN_NATURAL_NUMBER			
		    {
		    		for (int i= new Integer(startSeason.getText()); i <= new  
Integer(seasonSequenceEndSeason.getText()); i++) {
					seasons.add(i);
					// todo sanity check file size	
		    		}		
		    }		    	    		
		) | (
		
			{System.out.println("vector of seasons");}
			// vector			
			{
				seasons.add(new Integer(startSeason.getText()));
			}
			(								
		        EN_NATURAL_VECTOR (WHITESPACE)?
		        (
		            (EN_NATURAL_SEASON (WHITESPACE)?)?
		            seasonVectorAndSeason:EN_NATURAL_NUMBER (WHITESPACE)?
		            {		            		
		            		seasons.add(new Integer(seasonVectorAndSeason.getText 
()));
		            }
		        )
		    )+
		
		) | (		
			(.)*
			{System.out.println("a single season");}
			{
				seasons.add(new Integer(startSeason.getText()));
			}				
		)
	)
	;



More information about the antlr-interest mailing list