[antlr-interest] Overriding INPUT->istream->consume

Thu Sep 13 06:25:04 PDT 2012

Jim,

I've decided that for my current project, I need to override the
functionality in antlr3UTF8Consume(). I need to correctly handle '\r'
when setting the token line numbers. This means counting '\r' or '\n'
alone each as a newline, and counting '\r' '\n' in sequence as a single
newline. This was easy enough to do (attached as a reference for others,
since I could not find this anywhere).

What I have attached works, but notice that I had to redefine the arrays
trailingBytesForUTF8 and offsetsFromUTF8 to use them in my version of
the code. This is because they are declared as static in
antlr3inputstream.c. I don't like the idea of modifying the distributed
source for the runtime directly to make it not static (this is hard to
maintain properly). I also don't like my current solution of just
duplicating the code. My question for Jim is, is there a better way to
do this, or is this something that could be improved in later versions
of the runtime (this is 3.4)?

Cheers,

- Justin Murray

----
@lexer::apifuncs
{
	// Point the input stream's consume function at customUTF8Consume
	// (defined in @lexer::members) so that the custom '\r' / '\n' line
	// counting is used instead of the runtime's default consume.
	INPUT->istream->consume = customUTF8Consume;
}

@lexer::members
{
	// ------------------------------------------------------
	// Following is from Unicode.org (see antlr3convertutf.c)
	//

	/// Index into the table below with the first byte of a UTF-8
sequence to
	/// get the number of trailing bytes that are supposed to follow
it.
	/// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The
table is
	/// left as-is for anyone who may want to do such conversion,
which was
	/// allowed in earlier algorithms.
	///
	static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
	};

	/// Magic values subtracted from a buffer value during UTF8
conversion.
	/// This table contains as many values as there might be
trailing bytes
	/// in a UTF-8 sequence.
	///
	static const UTF32 offsetsFromUTF8[6] = 
	{ 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL,
0xFA082080UL, 0x82082080UL };

	// End of Unicode.org tables
	// -------------------------

	/// Consume one UTF-8 encoded character from the input stream, keeping
	/// the line and column bookkeeping up to date.
	///
	/// Line counting differs from a plain '\n' based scheme:
	///  - a lone '\r' or a lone '\n' each count as one newline;
	///  - a '\r' immediately followed by '\n' counts as a single newline
	///    (the line is counted at the '\r'; the '\n' only resets the
	///    column and the current-line pointer).
	///
	/// If the stream is already at the end of the buffer nothing happens.
	/// If the lead byte announces more trailing bytes than remain in the
	/// buffer, the stream is moved to the end of the buffer and no
	/// character is decoded.
	///
	/// \param is  Integer stream whose `super` field is the
	///            pANTLR3_INPUT_STREAM to advance.
	///
	static void	customUTF8Consume(pANTLR3_INT_STREAM is)
	{
		pANTLR3_INPUT_STREAM    input;
		ANTLR3_UINT32           extraBytesToRead;
		ANTLR3_UCHAR            ch;
		pANTLR3_UINT8           nextChar;
		pANTLR3_UINT8           bufEnd;

		input    = ((pANTLR3_INPUT_STREAM) (is->super));
		nextChar = (pANTLR3_UINT8)input->nextChar;

		// One past the last valid byte of the input buffer
		//
		bufEnd   = ((pANTLR3_UINT8)input->data) + input->sizeBuf;

		// Already at the end of the input: nothing to consume
		//
		if	(nextChar >= bufEnd)
		{
			return;
		}

		// Indicate one more character in this line
		//
		input->charPositionInLine++;

		// Are there more bytes needed to make up the whole thing?
		//
		extraBytesToRead = trailingBytesForUTF8[*nextChar];

		// The sequence would run past the end of the buffer, so just
		// move to the end of the input without decoding anything.
		//
		if	(nextChar + extraBytesToRead >= bufEnd)
		{
			input->nextChar = bufEnd;
			return;
		}

		// Cases deliberately fall through (see note A in antlr3convertutf.c)
		// Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
		// we allow it.
		//
		ch  = 0;
		switch (extraBytesToRead)
		{
			case 5: ch += *nextChar++; ch <<= 6;	/* fallthrough */
			case 4: ch += *nextChar++; ch <<= 6;	/* fallthrough */
			case 3: ch += *nextChar++; ch <<= 6;	/* fallthrough */
			case 2: ch += *nextChar++; ch <<= 6;	/* fallthrough */
			case 1: ch += *nextChar++; ch <<= 6;	/* fallthrough */
			case 0: ch += *nextChar++;
				break;

			// Not reachable: the table only yields 0..5
			//
			default:
				break;
		}

		// Magically correct the input value
		//
		ch -= offsetsFromUTF8[extraBytesToRead];

		if  (ch == '\n')
		{
			// input->nextChar still points at the '\n' itself here (it
			// is only updated at the end of this function), so the byte
			// before it is the previously consumed one. Count a new line
			// only if this is the first character, or the previous
			// character was not a '\r' (which already counted the line).
			//
			if ((input->nextChar == input->data) ||
				(*((pANTLR3_UINT8)input->nextChar - 1) != '\r'))
			{
				input->line++;
			}

			/* Reset for start of a new line of input */
			input->charPositionInLine	= 0;
			input->currentLine		= (void *)nextChar;
		}
		else if (ch == '\r')
		{
			/* Reset for start of a new line of input */
			input->line++;
			input->charPositionInLine	= 0;
			input->currentLine		= (void *)nextChar;
		}

		// Update input pointer
		//
		input->nextChar = nextChar;
	}
}
----