[antlr-interest] Problem with caseSensitivity

Matthew Ford Matthew.Ford at forward.com.au
Thu May 29 16:59:14 PDT 2003


Hi all,
Below is a simple grammar which adds one to the height and width of all
images in an HTML file (to fix an editor bug).
However with
    caseSensitive=false;
when the input is
<link rel="StyleSheet" ...
the output looks like

<link rel="stylesheet" ..

Note: the text inside " " is converted to lowercase.

Removing the option caseSensitive=false; fixes this problem, but then the
grammar will only match "img", not "IMG".

What am I missing here?

regards
matthew


{
import java.util.Vector;
import java.io.PrintStream;
}

/** Lexer that copies an HTML file to a PrintStream, rewriting img tags on
 *  the way: quoted height and width attribute values are incremented by one
 *  (see the ATTR rule).  The IMG rule marks its token as Token.SKIP and all
 *  other input is consumed and echoed by the SCARF filter rule, so the lexer
 *  keeps scarfing until EOF.
 */
class LinkExtractor extends Lexer;
options {
 // Left disabled: the accompanying report says that with this option on,
 // echoed text such as rel="StyleSheet" came out lower-cased.  While it is
 // off, the "<img" literal and the 'a'..'z' range in WORD only match
 // lower-case input.
 //caseSensitive=false;
 // Two characters of lookahead.
 k=2;
 // Input not matched by a non-protected rule is handed to SCARF.
 filter=SCARF;
 // Octal escapes: the vocabulary is characters 3 through 127.
 charVocabulary='\3'..'\177';
}

{
 /** Destination for the rewritten HTML; stays null until setOutput is called. */
 PrintStream output;

 /**
  * Selects the stream that every rule action prints to.
  *
  * @param out stream receiving the rewritten HTML
  */
 public void setOutput(PrintStream out) {
   this.output = out;
 }


 /**
  * Strip the quotes from a "..." or '...' string.
  *
  * Double quotes are tried first: if src contains at least two of them, the
  * text between the first and the last is returned.  Otherwise the same is
  * tried with single quotes.  If neither kind occurs at least twice, src is
  * returned unchanged; a lone quote character therefore no longer ends in a
  * StringIndexOutOfBoundsException.
  *
  * @param src text that may be wrapped in quotes; must not be null
  * @return the text between the outermost matching quotes, or src itself
  */
 public static String stripQuotes(String src) {
  String inner = textBetween(src, '"');
  if ( inner==null ) {
   inner = textBetween(src, '\'');
  }
  return inner!=null ? inner : src;
 }

 /**
  * Text strictly between the first and last occurrence of quote, or null
  * when quote does not occur at two distinct positions in src.
  */
 private static String textBetween(String src, char quote) {
  int h = src.indexOf(quote);
  int t = src.lastIndexOf(quote);
  if ( h==-1 || h==t ) {
   return null;
  }
  return src.substring(h+1,t);
 }

}


// Matches a whole img tag: the "<img" opener, whitespace, one or more
// attributes and the closing '>'.  The opener and the '>' are printed here;
// the token is then marked Token.SKIP.
IMG
 : "<img"   { output.print("<img"); }
   WS
   ( ATTR )+
   '>'      { output.print(">"); }
   { $setType(Token.SKIP); }
 ;


/** One name=value attribute.  The name and an unquoted value are echoed as
 *  they are matched.  A quoted value is echoed afterwards: for height and
 *  width (any case) a numeric value is replaced by value+1 in double quotes;
 *  every other quoted value, including a non-numeric height or width such as
 *  a percentage, is echoed exactly as it was read.
 */
protected
ATTR
options {
 ignore=WS;
}
 : w:WORD '=' {output.print(" "+w.getText()+"=");}
  ( s:STRING // output handled below
  | v:WORD {output.print(v.getText());}
  )
  {
    // s is null when the unquoted alternative matched; that value has
    // already been printed above.
    if (s!=null) {
      String name = w.getText();
      String quoted = s.getText();
      String echoed = quoted;
      if ( name.equalsIgnoreCase("height") || name.equalsIgnoreCase("width") ) {
        try {
          int size = Integer.parseInt(stripQuotes(quoted).trim());
          echoed = "\"" + (size+1) + "\"";
        } catch (NumberFormatException ignored) {
          // Not a plain integer (percentage, empty, etc.): keep the
          // original text rather than aborting the whole run.
        }
      }
      output.print(echoed+" ");
    }
  }
 ;

/** One or more characters drawn from lower-case letters, digits and the
 *  punctuation / . # _ ; used for attribute names and unquoted values.
 */
protected
WORD
 : ( options { generateAmbigWarnings=false; }
   : 'a'..'z'
   | '0'..'9'
   | '/'
   | '.'
   | '#'
   | '_'
   )+
 ;

/** A single decimal digit. */
protected
DIGIT : '0'..'9' ;

/** A double- or single-quoted string.  The quotes are part of the matched
 *  text, and the body is any run of characters other than the opening quote.
 */
protected
STRING
 : '"'  ( ~'"'  )* '"'
 | '\'' ( ~'\'' )* '\''
 ;

/** A single whitespace character or line terminator.  Whatever is matched is
 *  echoed unchanged to the output stream, and each line terminator also
 *  calls newline().
 */
protected
WS : ( ' '  {output.print(" ");}
  | '\t' {output.print("\t");}
  | '\f' {output.print("\f");}
  | ( "\r\n" {output.print("\r\n");} // DOS
   | '\r'   {output.print("\r");}   // Macintosh
   | '\n'   {output.print("\n");}   // Unix: echo LF as LF, not as CR
   )
   { newline(); }
  )
 ;

/** Filter rule: consumes input that IMG does not match and echoes it, one
 *  whitespace item or one arbitrary character at a time.
 */
protected
SCARF
 : WS // track line numbers while you scarf
 // Echo the rule text rather than a labelled character: a label is read
 // through LA(1), which is lower-cased when caseSensitive=false, whereas the
 // token text keeps the case of the input.
 | . {output.print($getText);}
 ;

----------------------------------------------------
Dr. Matthew.Ford
Forward Computing & Control Pty. Ltd.
www.forward.com.au


 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ 




More information about the antlr-interest mailing list