[antlr-interest] Simple (should be) lexer Q

Gerald B. Rosenberg gbr at newtechlaw.com
Sat Jun 11 00:19:02 PDT 2005


At 09:47 AM 6/10/2005, Gerald B. Rosenberg wrote:

>Should be simple, but I cannot see the problem.

Full test case.

The problem is with sub-rule 4 (the first alternative of CAPWORDS).  As 
written, it fails if (what should be) a NAME not containing a COMMA is the 
very last text in the input stream.  The commented-out version of sub-rule 4 
fails to match multiple CAPWORDs -- sub-rule 4 is never used unless something 
like the (SPACE)? qualifier is added.

Any idea what is going on?  Suggested fixes?

Thanks,
Gerald
-------------- next part --------------
package net.certiv.text.citation;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;

import antlr.Token;

/**
 * Command-line driver that runs {@link TestLexer} over a text file and prints
 * every token it produces, including the final EOF token.
 */
public class TestMain {

	/** Input file used when no path is supplied on the command line. */
	private static final String DEFAULT_FILE_NAME = "D:\\DevFiles\\EclipseWorkSpace31M6\\MainDocXmlProcessProject\\src"
			+ "\\net\\certiv\\text\\citation\\" + "caseName.txt";

	/**
	 * Lexes the input file and prints one line per token in the form
	 * <code>Token N (type: NAME): text</code>.
	 * 
	 * @param args
	 *            the commandline arguments; if present, <code>args[0]</code> is
	 *            the path of the file to lex, otherwise the built-in default
	 *            path is used
	 * @throws Exception
	 *             if the file cannot be opened or the lexer reports an error
	 */
	static public void main(String[] args) throws Exception {
		// the name of the file to read
		String fileName = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_NAME;

		File file = new File(fileName);
		BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
		try {
			TestLexer lex = new TestLexer(in);

			System.out.println("Run lexer");
			int i = 0;
			Token t;
			// The EOF token is printed as well, so test the type after printing.
			do {
				t = lex.nextToken();
				String type = TestParser._tokenNames[t.getType()];
				System.out.println("Token " + i + " (type: " + type + "): " + t.getText());
				i++;
			} while (t.getType() != Token.EOF_TYPE);
		} finally {
			// Release the file handle even when the lexer throws.
			in.close();
		}
	}
}
-------------- next part --------------
name is John A. Smith one.
name is John Smith, two.
name is John three.
name is John A. Smith, Inc. four.
name is John Smith, Inc. five.
name is John, Inc. six.
the Acme Corp. Universal Machinery is seven.
the Acme Corp. is eight.
the Acme Corporation is nine.
the Mojo Acme System, Ltd. is ten.
Terrible Widget Company. is eleven.
Wimpy Widget Co.

-------------- next part --------------
// Test Grammar file for Name identification
// ----------------------------------------
// Header section: declares the Java package for the generated classes
// (the same package that TestMain is declared in).
header {
    package net.certiv.text.citation;
}

// ==========================================================
// Parser

// Parser for the name-identification test. TestMain only uses this class for
// its _tokenNames table; the rules below are not invoked by that driver.
class TestParser extends Parser;

// exportVocab: token vocabulary is exported under the name "Test"
//              (the lexer below uses the same name).
// k:           two tokens of lookahead.
// buildAST:    AST construction is switched on.
options
{ 
    exportVocab = Test;
    k = 2; 
    buildAST = true;
}

// Start rule: one or more textSegments followed by EOF.
// The '!' on EOF keeps the EOF token out of the AST.
text:
    (textSegment)+ EOF!
;

// A single segment of text: a NAME, one of the listed punctuation tokens,
// a SPACE (dropped from the AST by '!'), or a greedy run of one or more WORDs.
// Alternatives must be in preferential order: most specific to catchall.
// Note: EQUAL, DASH, LANGLE, RANGLE, SLASH and CAPWORDS are token rules in the
// lexer but have no alternative here.
textSegment:
    ( NAME
    | OPAREN
    | CPAREN
    | SEMI
    | COMMA
    | PERIOD
    | SPACE!
    | (options {greedy = true;} : WORD )+  
    )  
; 

// ==========================================================
// Lexer

// Lexer for the name-identification test; instantiated directly by TestMain.
class TestLexer extends Lexer;

// k:              up to 10 characters of lookahead.
// exportVocab:    same vocabulary name ("Test") as the parser above.
// testLiterals:   literal testing is switched on.
// charVocabulary: characters '\3'..'\377' plus '\u1000'..'\u2fff'.
// caseSensitive:  matching is case sensitive.
// filter:         filter mode is off.
options 
{
    k = 10;
    exportVocab = Test;
    testLiterals = true;
    charVocabulary = '\3'..'\377' | '\u1000'..'\u2fff';
    caseSensitive=true;
    filter = false;
}


// Single-character tokens.
// Overlaps to be aware of: ',', '-' and '.' are also members of PUNCT (and so
// of ALPHA/WORD), PERIOD is used inside CAPWORD, and ' ' is also the first
// alternative of WS.
EQUAL:     '=';
SEMI:      ';';
COMMA:     ',';
DASH:      '-';
OPAREN:    '(';
CPAREN:    ')';
LANGLE:    '<';
RANGLE:    '>';
SLASH:     '/';
PERIOD:    '.';
SPACE:     ' ';


// NAME token. Two alternatives:
//   6: CAPWORDS COMMA WS CAPWORD  (e.g. the shape of "John Smith, Inc."),
//      selected only when the syntactic predicate on the left of '=>' matches;
//   7: CAPWORDS alone.
// The print actions write the alternative number to stdout as a trace.
// CAPWORDS and WS are not declared 'protected', so they are also token rules
// in their own right in addition to being called from here.
NAME:
    ( CAPWORDS COMMA WS CAPWORD ) => CAPWORDS COMMA WS CAPWORD { System.out.print("6 "); }
    | CAPWORDS { System.out.print("7 "); }
;

// One or more capitalized words. Two alternatives:
//   4: CAPWORD followed by one or more (WS CAPWORD) pairs, then an optional
//      trailing SPACE; the '!' on (SPACE)? drops that space from the text;
//   5: a single CAPWORD.
// The print actions write the alternative number to stdout as a trace.
// The commented-out line is the earlier form of alternative 4 without the
// (SPACE)? qualifier; the accompanying message reports that form never matches
// multiple CAPWORDs, and that the active form fails when a comma-less NAME is
// the very last text in the input.
// NOTE(review): the cause of either failure is not established here.
CAPWORDS:
//  ( CAPWORD (WS CAPWORD)+ ) => CAPWORD (WS CAPWORD)+  { System.out.print("4 "); }
    ( CAPWORD (WS CAPWORD)+ (SPACE)? ) => CAPWORD (WS CAPWORD)+ (SPACE)?!  { System.out.print("4 "); }
    | CAPWORD { System.out.print("5 "); }
;

// Helper rule (protected): a single capitalized word. Three alternatives, each
// guarded by a syntactic predicate identical to its body:
//   1: uppercase letter, one or more letters, then a period (e.g. "Inc.");
//   2: uppercase letter then a period (an initial such as "A.");
//   3: uppercase letter then one or more letters, no period.
// Because LETTERS is (LETTER)+, alternatives 1 and 3 need at least two letters;
// a lone uppercase letter without a period matches none of them.
// The print actions write the alternative number to stdout as a trace.
protected
CAPWORD:
    (  UPPERLETTER LETTERS PERIOD) => UPPERLETTER LETTERS PERIOD { System.out.print("1 "); }
    | (UPPERLETTER PERIOD) => UPPERLETTER PERIOD { System.out.print("2 "); }
    | (UPPERLETTER LETTERS) => UPPERLETTER LETTERS  { System.out.print("3 "); }
;

// Catch-all word token: one or more ALPHA characters (digits, letters,
// extended characters or PUNCT). Prints "0 " to stdout as a trace.
// ALPHA includes uppercase letters, so WORD and NAME/CAPWORDS can start on the
// same character.
WORD:
   (ALPHA)+ { System.out.print("0 "); }
;

// Helper rule (protected): a lowercase letter, then any number of letters,
// then an optional period. Not referenced by any other rule in this grammar.
protected
LWRWORD:
    LOWERLETTER (LETTER)* (PERIOD)?
;

// Helper rule (protected): one or more letters of either case.
protected
LETTERS:
   (LETTER)+
;

// Helper rule (protected): a string delimited by double quotes or by single
// quotes. The '!' on each delimiter excludes the quote characters from the
// matched text. Not referenced by any other rule in this grammar.
protected 
UNQUOTED_STRING:
    	'"'! (~'"')* '"'! 	|	'\''! (~'\'')* '\''!
	;

// Helper rule (protected): any single character allowed in a WORD -- a digit,
// a letter, an extended character, or a PUNCT character.
protected
ALPHA: 
     DIGIT | LETTER | EXTENDEDCHARS | PUNCT 
;

// Helper rule (protected): the punctuation characters ( ) ; = - , < >.
// Not referenced by any other rule in this grammar.
protected
PUNCTEXT:
   '(' | ')' | ';' | '=' | '-'  | ',' | '<' | '>'  
;

// Helper rule (protected): punctuation characters accepted inside a WORD via
// ALPHA. Includes ',', '-' and '.', which are also the COMMA, DASH and PERIOD
// tokens; excludes ( ) ; = < > / and space.
protected
PUNCT:
    '!' | '\"' | '#' | '$' | '%'  | '&' | '\'' | '*' | '+' | ',' | '-' | '.' |
    ':' | '?'  | '@' | '[' | '\\' | ']' | '^'  | '_' | '`' | '{' | '|' | '}' | '~'
;


// Helper rule (protected): characters '\200'..'\377' and '\u1000'..'\u2fff',
// i.e. the non-ASCII part of the lexer's charVocabulary.
protected
EXTENDEDCHARS:
    '\200'..'\377' | '\u1000'..'\u2fff' 
;

// Helper rule (protected): a single letter of either case.
// The commented-out line gives the same A-Z / a-z ranges as octal escapes.
protected
LETTER:
    UPPERLETTER | LOWERLETTER
    // '\101'..'\132' | '\141'..'\172'
;

// Helper rule (protected): a single uppercase letter 'A'..'Z'.
protected
UPPERLETTER:
    'A'..'Z'
;

// Helper rule (protected): a single lowercase letter 'a'..'z'.
protected
LOWERLETTER:
    'a'..'z'
;

// Helper rule (protected): a single decimal digit '0'..'9'.
protected
DIGIT:
    '0'..'9'
;

// Whitespace: one or more of SPACE, tab, carriage return or line feed.
// newline() is called for each '\n', and the token type is set to Token.SKIP.
// Not declared 'protected', so this is a token rule as well as a sub-rule
// called from NAME and CAPWORDS. Its first alternative, SPACE, is itself a
// token rule matching the same ' ' character.
WS: 
    ( SPACE
    | '\t'
    | '\r' 
    | '\n' { newline (); }
    )+ { $setType(Token.SKIP); }
;
-------------- next part --------------
----
Gerald B. Rosenberg, Esq.
NewTechLaw
285 Hamilton Avenue, Suite 520
Palo Alto, CA  94301-2576

650.325.2100  (office)  /  650.703.1724  (cell)
650.325.2107  (fax)

www.newtechlaw.com


More information about the antlr-interest mailing list