[antlr-interest] Problem with caseSensitivity
Matthew Ford
Matthew.Ford at forward.com.au
Thu May 29 16:59:14 PDT 2003
Hi all,
Below is a simple grammar which adds one to the height and width of all
images in an HTML file (to fix an editor bug).
However with
caseSensitive=false;
when the input is
<link rel="StyleSheet" ...
the output looks like
<link rel="stylesheet" ..
Note: the text inside " " is converted to lowercase.
Removing the option caseSensitive=false; fixes this problem, but then the
grammar will only match "img", not "IMG".
What am I missing here?
regards
matthew
{
import java.util.Vector;
import java.io.PrintStream;
}
/** Lexer that copies an html file to the configured output stream,
* rewriting each <img ...> tag so that quoted height and width
* attribute values are incremented by one. Input that is not part of
* an <img> tag is echoed by the SCARF filter rule. IMG is
* defined to skip so the lexer will continue scarfing until EOF.
*/
class LinkExtractor extends Lexer;
options {
// NOTE(review): enabling this lets "<IMG" match as well, but it was reported
// to lowercase the echoed text -- confirm before re-enabling.
//caseSensitive=false;
// two characters of lookahead
k=2;
// the filter rule; its actions echo the input to output
filter=SCARF;
// accepted input characters: octal \3 through \177
charVocabulary='\3'..'\177';
}
{
/** Stream the rule actions print to; stays null until setOutput is called. */
PrintStream output;

/** Selects the stream that receives everything the rule actions print. */
public void setOutput(PrintStream out) {
    this.output = out;
}
/**
 * Strip the enclosing quotes from a "..." or '...' string.
 *
 * The delimiter is whichever quote character occurs first in src, and the
 * text between its first and last occurrence is returned. Picking the
 * delimiter this way keeps the other kind of quote, when it appears inside
 * the string (as in 'a"b'), from being mistaken for the delimiter.
 *
 * @param src the text to strip; must not be null
 * @return the text between the delimiting quotes, or src unchanged when it
 *         contains no quote or only a single, unpaired delimiter
 */
public static String stripQuotes(String src) {
    int firstDouble = src.indexOf('"');
    int firstSingle = src.indexOf('\'');
    if (firstDouble == -1 && firstSingle == -1) {
        return src;
    }
    // The opening delimiter is the quote character that appears earliest.
    boolean doubleOpens = firstSingle == -1
            || (firstDouble != -1 && firstDouble < firstSingle);
    char quote = doubleOpens ? '"' : '\'';
    int h = doubleOpens ? firstDouble : firstSingle;
    int t = src.lastIndexOf(quote);
    if (t == h) {
        // Only one delimiter present: there is no quoted span to extract.
        return src;
    }
    return src.substring(h + 1, t);
}
}
// Match an image tag: the literal "<img", whitespace, one or more attributes,
// then '>'. The opening "<img" and the closing ">" are echoed to output here;
// WS and ATTR print their own text. The literal is lowercase, and the
// caseSensitive=false option is commented out in the lexer options, so
// "<IMG" is presumably not matched by this rule -- confirm.
IMG : ("<img" {output.print("<img");}
WS (ATTR)+
'>' {output.print(">");})
// the tag has already been written to output, so no token is handed on
{$setType(Token.SKIP);}
;
// Match one attribute of the form name=value and echo it to output.
// A quoted height or width value that is a plain integer is written back
// incremented by one; every other value is written back unchanged.
protected
ATTR
options {
ignore=WS;
}
: w:WORD '=' {output.print(" "+w.getText()+"=");}
( s:STRING // output handled below
| v:WORD {output.print(v.getText());}
)
{
    if (s!=null) {
        String heightOrWidth = stripQuotes(s.getText());
        if (( w.getText().equalsIgnoreCase("height") )
            || ( w.getText().equalsIgnoreCase("width") ) ) {
            try {
                int size = Integer.parseInt(heightOrWidth);
                size++;
                output.print("\""+size+"\" ");
            } catch (NumberFormatException e) {
                // Values such as "100%" are not plain integers; pass them
                // through untouched instead of aborting in the middle of a
                // tag whose beginning has already been printed.
                output.print(s.getText()+" ");
            }
        } else {
            // some other attribute like src just output
            output.print(s.getText()+" ");
        }
    } else {
        // s==null have output v above
    }
}
;
/** Match one or more characters drawn from lowercase letters, digits,
* '/', '.', '#' and '_'; used for attribute names and unquoted
* attribute values (can be file, int, etc...). */
protected
WORD: (
options {
// do not report ambiguity warnings for this subrule
generateAmbigWarnings=false;
}
: 'a'..'z' | '0'..'9' | '/' | '.' | '#' | '_'
)+
;
// Match a single decimal digit.
// NOTE(review): no rule visible in this grammar references DIGIT -- confirm it is still needed.
protected
DIGIT
: '0'..'9'
;
// Match a quoted string, delimiters included: either "..." containing no
// double quote, or '...' containing no single quote. The contents may be empty.
protected
STRING
: '"' (~'"')* '"'
| '\'' (~'\'')* '\''
;
// Match one whitespace character or one line break and echo it to output
// exactly as it was read. Line breaks also call newline().
protected
WS : ( ' ' {output.print(" ");}
| '\t' {output.print("\t");}
| '\f' {output.print("\f");}
| ( "\r\n" {output.print("\r\n");} // DOS
| '\r' {output.print("\r");} // Macintosh
| '\n' {output.print("\n");} // Unix: echo the line feed itself, not a carriage return
)
{ newline(); }
)
{ //$setType(Token.SKIP);
}
;
// Filter rule (named by filter=SCARF in the lexer options): consume either
// whitespace, which WS echoes itself, or any other single character, which
// is printed to output here.
protected
SCARF
: WS // track line numbers while you scarf
| a:. {output.print(a);}
;
----------------------------------------------------
Dr. Matthew.Ford
Forward Computing & Control Pty. Ltd.
www.forward.com.au
Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/
More information about the antlr-interest
mailing list