[antlr-interest] Need help with an address parser

Jeff Bergman jbergmanster at gmail.com
Thu Jun 22 13:55:18 PDT 2006


I am trying to build an address parser but I am having problems.  The
grammar seems to choke on my streetDesignator and streetDesignatorLong in
the Parser section.  I tried putting these in the lexer but fared no
better.  I am not sure how to handle these.  I need them to be separated
because I have to eventually convert the long version to the abbreviated
designators when parsing the address.


// File-level options: ask ANTLR to generate C# source for this grammar.
options {
    language = "CSharp";
}

// Parser for postal addresses. Entry rule: address.
// No k option is set here, so the parser uses ANTLR's default lookahead
// depth (presumably k=1 for ANTLR 2 -- confirm against the tool version used).
class AddressParser extends Parser;
options {
    buildAST = true;    // uses CommonAST by default
}

// Entry rule: an address is either a PO box or a simple street address.
address
    :   poBox
    |   simpleStreetAddress
    ;

// PO box address: the keyword "PO", the box keyword, then the box number.
poBox
    :   "PO" boxDesignator NUMBER
    ;

// Street address: house NUMBER, one or more street-name words, the street
// designator, then an optional trailing direction and an optional unit.
//
// A long designator word (for example "LAKE" or "PINE") can be either part of
// the street name or the designator that ends it, so the name loop and the
// loop exit both start with streetDesignatorLong. A plain LL(k) decision
// cannot separate the two and would swallow the final designator into the
// name ("123 MAIN STREET" would then fail for lack of a designator).
// The syntactic predicate keeps a long designator inside the name only when
// another designator still follows it (after any further IDENT/direction
// words); otherwise the loop exits and the word is matched as the designator.
// This relies on streetDesignator not being able to match the empty string.
simpleStreetAddress
    :   NUMBER
        (   IDENT
        |   ( streetDesignatorLong
              ( IDENT | directionDesignator )*
              ( streetDesignator | streetDesignatorLong )
            ) => streetDesignatorLong
        |   directionDesignator
        )+
        ( streetDesignator | streetDesignatorLong )
        ( directionDesignator )?
        ( unitDesignator ( NUMBER | IDENT )? )?
    ;


// The keyword that follows "PO" in a PO box address.
boxDesignator
    :   "BOX"
    ;

// Secondary-unit keywords (apartment, suite, floor, ...). Exactly one keyword
// is matched; the optional unit number is handled by the calling rule.
unitDesignator
    :   "APT"  | "BASE" | "BLDG" | "BSMT" | "DEPT"
    |   "FL"   | "FRNT" | "HNGR" | "LBBY" | "LOT"
    |   "LOWR" | "OFC"  | "PH"   | "PIER" | "POB"
    |   "REAR" | "RM"   | "SIDE" | "SLIP" | "SPC"
    |   "STE"  | "STOP" | "TRLR" | "UNIT" | "UPPR"
    ;

 // Compass-direction keywords; exactly one is matched.
 directionDesignator
    :   "E" | "N" | "NE" | "NW"
    |   "S" | "SE" | "SW" | "W"
    ;


// Abbreviated street designators (street suffixes). Exactly one keyword is
// matched. The alternative list must not end with a dangling '|': that would
// add an empty alternative, letting this rule match nothing and making every
// decision that references it ambiguous.
streetDesignator:
  "ALY" |
  "ANX" |
  "ARC" |
  "AVE" |
  "BCH" |
  "BG" |
  "BGS" |
  "BLF" |
  "BLFS" |
  "BLVD" |
  "BND" |
  "BR" |
  "BRG" |
  "BRK" |
  "BRKS" |
  "BTM" |
  "BYP" |
  "BYU" |
  "CIR" |
  "CIRS" |
  "CLB" |
  "CLF" |
  "CLFS" |
  "CMN" |
  "COR" |
  "CORS" |
  "CP" |
  "CPE" |
  "CRES" |
  "CRK" |
  "CRSE" |
  "CRST" |
  "CSWY" |
  "CT" |
  "CTR" |
  "CTRS" |
  "CTS" |
  "CURV" |
  "CV" |
  "CVS" |
  "CYN" |
  "DL" |
  "DM" |
  "DR" |
  "DRS" |
  "DV" |
  "EST" |
  "ESTS" |
  "EXPY" |
  "EXT" |
  "EXTS" |
  "FALL" |
  "FLD" |
  "FLDS" |
  "FLS" |
  "FLT" |
  "FLTS" |
  "FRD" |
  "FRDS" |
  "FRG" |
  "FRGS" |
  "FRK" |
  "FRKS" |
  "FRST" |
  "FRY" |
  "FT" |
  "FWY" |
  "GDN" |
  "GDNS" |
  "GLN" |
  "GLNS" |
  "GRN" |
  "GRNS" |
  "GRV" |
  "GRVS" |
  "GTWY" |
  "HBR" |
  "HBRS" |
  "HL" |
  "HLS" |
  "HOLW" |
  "HTS" |
  "HVN" |
  "HWY" |
  "INLT" |
  "IS" |
  "ISLE" |
  "ISS" |
  "JCT" |
  "JCTS" |
  "KNL" |
  "KNLS" |
  "KY" |
  "KYS" |
  "LAND" |
  "LCK" |
  "LCKS" |
  "LDG" |
  "LF" |
  "LGT" |
  "LGTS" |
  "LK" |
  "LKS" |
  "LN" |
  "LNDG" |
  "LOOP" |
  "MALL" |
  "MDW" |
  "MDWS" |
  "MEWS" |
  "ML" |
  "MLS" |
  "MNR" |
  "MNRS" |
  "MSN" |
  "MT" |
  "MTN" |
  "MTNS" |
  "MTWY" |
  "NCK" |
  "OPAS" |
  "ORCH" |
  "OVAL" |
  "PARK" |
  "PASS" |
  "PATH" |
  "PIKE" |
  "PKWY" |
  "PL" |
  "PLN" |
  "PLNS" |
  "PLZ" |
  "PNE" |
  "PNES" |
  "PR" |
  "PRT" |
  "PRTS" |
  "PSGE" |
  "PT" |
  "PTS" |
  "RADL" |
  "RAMP" |
  "RD" |
  "RDG" |
  "RDGS" |
  "RDS" |
  "RIV" |
  "RNCH" |
  "ROW" |
  "RPD" |
  "RPDS" |
  "RR" |
  "RST" |
  "RTE" |
  "RUE" |
  "RUN" |
  "SHL" |
  "SHLS" |
  "SHR" |
  "SHRS" |
  "SKWY" |
  "SMT" |
  "SPG" |
  "SPGS" |
  "SPUR" |
  "SQ" |
  "SQS" |
  "ST" |
  "STA" |
  "STRA" |
  "STRM" |
  "STS" |
  "TER" |
  "TPKE" |
  "TRAK" |
  "TRCE" |
  "TRFY" |
  "TRL" |
  "TRWY" |
  "TUNL" |
  "UN" |
  "UNS" |
  "UPAS" |
  "VIA" |
  "VIS" |
  "VL" |
  "VLG" |
  "VLGS" |
  "VLY" |
  "VLYS" |
  "VW" |
  "VWS" |
  "WALK" |
  "WALL" |
  "WAY" |
  "WAYS" |
  "WL" |
  "WLS" |
  "XING" |
  "XRD"
;
// Spelled-out street designators. Each alternative must be made of keywords
// the lexer can actually produce: an IDENT token never contains a space or a
// parenthesis, so a multi-word designator is written as a sequence of
// keywords ("RURAL" "ROUTE") and the plural of SPUR as its own word ("SPURS").
// Note that this makes "RURAL" a keyword in its own right.
streetDesignatorLong:
  "ALLEY" |
  "ANNEX" |
  "ARCADE" |
  "AVENUE" |
  "BEACH" |
  "BURG" |
  "BURGS" |
  "BLUFF" |
  "BLUFFS" |
  "BOULEVARD" |
  "BEND" |
  "BRANCH" |
  "BRIDGE" |
  "BROOK" |
  "BROOKS" |
  "BOTTOM" |
  "BYPASS" |
  "BAYOU" |
  "BAYOO" |
  "CIRCLE" |
  "CIRCLES" |
  "CLUB" |
  "CLIFF" |
  "CLIFFS" |
  "COMMON" |
  "CORNER" |
  "CORNERS" |
  "CAMP" |
  "CAPE" |
  "CRESCENT" |
  "CREEK" |
  "COURSE" |
  "CREST" |
  "CAUSEWAY" |
  "COURT" |
  "CENTER" |
  "CENTERS" |
  "COURTS" |
  "CURVE" |
  "COVE" |
  "COVES" |
  "CANYON" |
  "DALE" |
  "DAM" |
  "DRIVE" |
  "DRIVES" |
  "DIVIDE" |
  "ESTATE" |
  "ESTATES" |
  "EXPRESSWAY" |
  "EXTENSION" |
  "EXTENSIONS" |
  "FIELD" |
  "FIELDS" |
  "FALLS" |
  "FLAT" |
  "FLATS" |
  "FORD" |
  "FORDS" |
  "FORGE" |
  "FORGES" |
  "FORK" |
  "FORKS" |
  "FOREST" |
  "FERRY" |
  "FORT" |
  "FREEWAY" |
  "GARDEN" |
  "GARDENS" |
  "GLEN" |
  "GLENS" |
  "GREEN" |
  "GREENS" |
  "GROVE" |
  "GROVES" |
  "GATEWAY" |
  "HARBOR" |
  "HARBORS" |
  "HILL" |
  "HILLS" |
  "HOLLOW" |
  "HEIGHTS" |
  "HAVEN" |
  "HIGHWAY" |
  "INLET" |
  "ISLAND" |
  "ISLANDS" |
  "JUNCTION" |
  "JUNCTIONS" |
  "KNOLL" |
  "KNOLLS" |
  "KEY" |
  "KEYS" |
  "LOCK" |
  "LOCKS" |
  "LODGE" |
  "LOAF" |
  "LIGHT" |
  "LIGHTS" |
  "LAKE" |
  "LAKES" |
  "LANE" |
  "LANDING" |
  "MEADOW" |
  "MEADOWS" |
  "MILL" |
  "MILLS" |
  "MANOR" |
  "MANORS" |
  "MISSION" |
  "MOUNT" |
  "MOUNTAIN" |
  "MOUNTAINS" |
  "MOTORWAY" |
  "NECK" |
  "OVERPASS" |
  "ORCHARD" |
  "PARKS" |
  "PARKWAY" |
  "PARKWAYS" |
  "PLACE" |
  "PLAIN" |
  "PLAINS" |
  "PLAZA" |
  "PINE" |
  "PINES" |
  "PRAIRIE" |
  "PORT" |
  "PORTS" |
  "PASSAGE" |
  "POINT" |
  "POINTS" |
  "RADIAL" |
  "ROAD" |
  "RIDGE" |
  "RIDGES" |
  "ROADS" |
  "RIVER" |
  "RANCH" |
  "RAPID" |
  "RAPIDS" |
  ("RURAL" "ROUTE") |
  "REST" |
  "ROUTE" |
  "SHOAL" |
  "SHOALS" |
  "SHORE" |
  "SHORES" |
  "SKYWAY" |
  "SUMMIT" |
  "SPRING" |
  "SPRINGS" |
  "SPURS" |
  "SQUARE" |
  "SQUARES" |
  "STREET" |
  "STATION" |
  "STRAVENUE" |
  "STREAM" |
  "STREETS" |
  "TERRACE" |
  "TURNPIKE" |
  "TRACK" |
  "TRACE" |
  "TRAFFICWAY" |
  "TRAIL" |
  "THROUGHWAY" |
  "TUNNEL" |
  "UNION" |
  "UNIONS" |
  "UNDERPASS" |
  "VIADUCT" |
  "VISTA" |
  "VILLE" |
  "VILLAGE" |
  "VILLAGES" |
  "VALLEY" |
  "VALLEYS" |
  "VIEW" |
  "VIEWS" |
  "WELL" |
  "WELLS" |
  "CROSSING"
  ;




// Lexer for the address grammar: produces NUMBER and IDENT tokens and skips
// whitespace.
class AddressLexer extends Lexer;
options {
    // Lexer lookahead depth (characters).
    k=10;
    // Literal-table lookup is off by default; the IDENT rule turns it back
    // on for itself with its own testLiterals option.
    testLiterals=false;
}
// Extra token type declared without a lexer rule. No rule visible in this
// grammar references it.
tokens
{
    ADDRESSDESIGNATORLONG;
}

// A single whitespace character (space, tab, form feed, newline or carriage
// return). The token is marked SKIP so it never reaches the parser.
WS
    :   ( ' ' | '\t' | '\f' | '\n' | '\r' )
        { _ttype = Token.SKIP; }
    ;


// Helper rule (protected: only usable from other lexer rules): one decimal digit.
protected
DIGIT : '0'..'9' ;
// Helper rule (protected: only usable from other lexer rules): one upper-case
// letter A-Z.
protected
CHAR : 'A'..'Z' ;

// One or more digits, an optional "/digits" part, then either a '-' followed
// by letters or digits, or zero or more trailing letters. Each trailing
// letter matched by the last alternative re-types the token as IDENT.
NUMBER
    :   ( DIGIT )+
        ( '/' ( DIGIT )+ )?
        (   ( '-' ( ( CHAR )+ | ( DIGIT )+ ) )
        |   ( CHAR { $setType(IDENT); } )*
        )
    ;




// A word: an upper-case letter followed by any mix of upper-case letters,
// hyphens and digits. testLiterals is enabled so the matched text is checked
// against the literals table (the quoted keywords used by the parser).
IDENT
    options { testLiterals = true; }
    :   CHAR ( CHAR | '-' | '0'..'9' )*
    ;
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://www.antlr.org/pipermail/antlr-interest/attachments/20060622/97346c11/attachment-0001.html


More information about the antlr-interest mailing list