
unit uPushTagParser;

interface

uses
	Classes;





type
	TPTPError =
	(
		ptpeNone = 0,		// no error (for transition table)
		ptpeUnknown,		// for start, this will be the generic error type reported until we get a well-formed parser
		ptpeEmptyTag,		// <>
		ptpeUnmatchedBracket,		// <tag <
		ptpeMissingWhitespace,		// <tag attr1="value1"attr2="value2">
		ptpeEndTagAttr,		// <tag>  </tag attr="value">
		ptpeMissingValue,		// <tag attr1 attr2="value2">
		ptpeCharWithoutTag,		// text <tag>
		ptpeBadSlash,		// <tag //>
		ptpeInvalidChar,		// #0 .. #32 except for regular whitespace
		
		ptpeMaximum		// unused, only as an enum terminator
	);





	TPTPTagStartEvent  = procedure(Sender: TObject; Tag: string; Attributes: TStrings) of object;
	TPTPTagEndEvent    = procedure(Sender: TObject; Tag: string) of object;
	TPTPErrorEvent     = procedure(Sender: TObject; Error: TPTPError; Ch: char; Row, Column: integer) of object;
	TPTPCharacterEvent = procedure(Sender: TObject; Characters: string) of object;
	TPTPCommentEvent   = procedure(Sender: TObject; Comment: string) of object;

	TPTPState =
	(
		ptpsDocBegin,        // document begin
		ptpsTagStartBracket, // just read the start bracket
		ptpsTagName,         // reading the tag name
		ptpsAttr,            // reading the tag attribute list
		ptpsAttrQuote,       // reading the tag attribute list, inside quotes
		ptpsTagClose,        // just read the end bracket
		ptpsEmptyTagSlash,   // just read the slash at the end of empty tag
		ptpsCharacter,       // reading text not in tags
		ptpsCommentOpen1,    // just read the opening "<!" part of the comment
		ptpsCommentOpen2,    // just read the opening "<!-" part of comment
		ptpsComment,         // inside a comment
		ptpsCommentDash,     // inside a comment, just read a single dash
		ptpsCommentDashes,   // inside a comment, just read at least two consecutive dashes
		ptpsCommentEnd,      // just read the comment closing right bracket "-->"

		ptpsInvalid          // used only as the maximum, enum terminator
	) ;




	TPTPAction = procedure(iChar: char) of object;




	
	TPTPTransitionTableElement = record
		Action: TPTPAction;
		Error: TPTPError;
		NewState: TPTPState;
	end;





	TPushTagParser = class
	public
		OnStartTag:  TPTPTagStartEvent;
		OnEndTag:    TPTPTagEndEvent;
		OnCharacter: TPTPCharacterEvent;
		OnError:     TPTPErrorEvent;
		OnComment:   TPTPCommentEvent;

		constructor Create(); overload;
		constructor Create(iOnStartTag: TPTPTagStartEvent; iOnEndTag: TPTPTagEndEvent; iOnCharacter: TPTPCharacterEvent; iOnError: TPTPErrorEvent; iOnComment: TPTPCommentEvent); overload;
		destructor Destroy(); override;

		procedure Push(Data: PChar; DataSize: integer);

	protected
		// Transition table:
		TransitionTable: array[TPTPState, Char] of TPTPTransitionTableElement;

		// State:
		State: TPTPState;
		StateBeforeComment: TPTPState;
		LastError: TPTPError;
		CurrentRow, CurrentCol: integer;

		// accumulators:
		CurrentTag: string;
		CurrentAttr: string;
		CurrentAttrList: TStringList;
		CurrentTagLevel: integer;
		CurrentComment: string;

		procedure InitTransitionTable();
		procedure TTSetAll         (iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
		procedure TTSetWhitespace  (iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
		procedure TTSetLetters     (iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
		procedure TTSetNumbers     (iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
		procedure TTSetAllPrintable(iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
		procedure TTSetString      (iString: string; iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
		procedure TTSetOne         (iChar: char;     iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);

		procedure DoBeginTag       (iChar: char);
		procedure DoTagName        (iChar: char);
		procedure DoError          (iChar: char);		// propagates LastError up the OnError event
		procedure DoTagClose       (iChar: char);
		procedure DoEmptyTagClose  (iChar: char);
		procedure DoAttr           (iChar: char);
		procedure DoAttrWhitespace (iChar: char);
		procedure DoAttrQuoteEnd   (iChar: char);
		procedure DoCharacter      (iChar: char);
		procedure DoComment1Abandon(iChar: char);
		procedure DoComment2Abandon(iChar: char);
	end;































	
implementation

uses
	SysUtils;





///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// TPushTagParser:

constructor TPushTagParser.Create();
begin
	inherited Create();
	LastError := ptpeNone;
	State := ptpsDocBegin;
	CurrentCol := 0;
	CurrentRow := 0;
	CurrentAttrList := TStringList.Create();
	InitTransitionTable();
end;





constructor TPushTagParser.Create(iOnStartTag: TPTPTagStartEvent; iOnEndTag: TPTPTagEndEvent; iOnCharacter: TPTPCharacterEvent; iOnError: TPTPErrorEvent; iOnComment: TPTPCommentEvent);
begin
	Create();

	OnStartTag  := iOnStartTag;
	OnEndTag    := iOnEndTag;
	OnCharacter := iOnCharacter;
	OnError     := iOnError;
	OnComment   := iOnComment;
end;





destructor TPushTagParser.Destroy();
begin
	CurrentAttrList.Free();

	// TODO
	
	inherited Destroy();
end;





procedure TPushTagParser.InitTransitionTable();
var
	c: Char;
	st: TPTPState;
begin
	// invalid characters:
	for st := Low(TPTPState) to High(TPTPState) do
	begin
		for c := #0 to #31 do
		begin
			TransitionTable[st, c].Error    := ptpeInvalidChar;
			TransitionTable[st, c].Action   := DoError;
			TransitionTable[st, c].NewState := st;
		end;
	end;

	// ptpsDocBegin:
	TTSetAllPrintable(ptpsDocBegin, ptpeCharWithoutTag, DoError,    ptpsDocBegin);
	TTSetWhitespace  (ptpsDocBegin, ptpeNone,           nil,        ptpsDocBegin);
	TTSetOne('<',     ptpsDocBegin, ptpeNone,           DoBeginTag, ptpsTagStartBracket);

	// ptpsTagStartBracket:
	TTSetAllPrintable(ptpsTagStartBracket, ptpeInvalidChar, DoError, ptpsTagStartBracket);		// produce an error by default
	TTSetLetters(ptpsTagStartBracket, ptpeNone, DoTagName, ptpsTagName);
	TTSetNumbers(ptpsTagStartBracket, ptpeNone, DoTagName, ptpsTagName);
	TTSetString('^`~@#$%^&()_-+*', ptpsTagStartBracket, ptpeNone, DoTagName, ptpsTagName);
	TTSetWhitespace(ptpsTagStartBracket, ptpeInvalidChar, DoError, ptpsTagStartBracket);		// whitespace is illegal, but non-fatal
	TTSetOne('!', ptpsTagStartBracket, ptpeNone,     nil,       ptpsCommentOpen1);
	TTSetOne('>', ptpsTagStartBracket, ptpeEmptyTag, DoError,   ptpsTagClose);
	TTSetOne('/', ptpsTagStartBracket, ptpeNone,     DoTagName, ptpsTagName);

	// ptpsTagName:
	TTSetAllPrintable(ptpsTagName, ptpeNone, DoTagname, ptpsTagName);
	TTSetWhitespace  (ptpsTagName, ptpeNone, nil, ptpsAttr);
	TTSetOne('>', ptpsTagName, ptpeNone, DoTagClose,      ptpsTagClose);
	TTSetOne('/', ptpsTagName, ptpeNone, DoEmptyTagClose, ptpsTagClose);
	TTSetWhitespace(ptpsTagName, ptpeNone, nil, ptpsAttr);

	// ptpsAttr:
	TTSetAllPrintable(ptpsAttr, ptpeNone, DoAttr, ptpsAttr);
	TTSetWhitespace  (ptpsAttr, ptpeNone, DoAttrWhitespace, ptpsAttr);
	TTSetOne('"', ptpsAttr, ptpeNone, DoAttr, ptpsAttrQuote);
	TTSetOne('>', ptpsAttr, ptpeNone, DoTagClose, ptpsTagClose);
	TTSetOne('/', ptpsAttr, ptpeNone, nil, ptpsEmptyTagSlash);

	// ptpsAttrQuote:
	TTSetAllPrintable(ptpsAttrQuote, ptpeNone, DoAttr, ptpsAttrQuote);
	TTSetOne('"', ptpsAttrQuote, ptpeNone, DoAttrQuoteEnd, ptpsAttr);
	TTSetOne('>', ptpsAttrQuote, ptpeInvalidChar, DoError, ptpsAttrQuote);

	// ptpsEmptyTagSlash:
	TTSetAllPrintable(ptpsEmptyTagSlash, ptpeInvalidChar, DoError, ptpsAttr);
	TTSetWhitespace(ptpsEmptyTagSlash, ptpeNone, nil, ptpsEmptyTagSlash);
	TTSetOne('>', ptpsEmptyTagSlash, ptpeNone, DoEmptyTagClose, ptpsTagClose);

	// ptpsTagClose:
	TTSetAllPrintable(ptpsTagClose, ptpeNone, DoCharacter, ptpsCharacter);
	TTSetWhitespace  (ptpsTagClose, ptpeNone, DoCharacter, ptpsCharacter);
	TTSetOne('<', ptpsEmptyTagSlash, ptpeNone, DoBeginTag, ptpsTagStartBracket);

	// ptpsCharacter:
	TTSetAllPrintable(ptpsCharacter, ptpeNone, DoCharacter, ptpsCharacter);
	TTSetWhitespace  (ptpsCharacter, ptpeNone, DoCharacter, ptpsCharacter);
	TTSetOne('<',     ptpsCharacter, ptpeNone, DoBeginTag,  ptpsTagStartBracket);

	// ptpsCommentOpen1:
	TTSetAllPrintable(ptpsCommentOpen1, ptpeNone, DoComment1Abandon, ptpsTagName);
	TTSetWhitespace  (ptpsCommentOpen1, ptpeNone, DoComment1Abandon, ptpsTagName);
	TTSetOne    ('-', ptpsCommentOpen1, ptpeNone, nil,               ptpsCommentOpen2);

	// ptpsCommentOpen2:
	TTSetAllPrintable(ptpsCommentOpen2, ptpeNone, DoComment2Abandon, ptpsTagName);
	TTSetWhitespace  (ptpsCommentOpen2, ptpeNone, DoComment2Abandon, ptpsTagName);
	TTSetOne    ('-', ptpsCommentOpen2, ptpeNone, nil,               ptpsComment);

	// ptpsComment:
	TTSetAll(     ptpsComment, ptpeNone, nil, ptpsComment);
	TTSetOne('-', ptpsComment, ptpeNone, nil, ptpsCommentDash);

	// ptpsCommentDash:
	TTSetAll(     ptpsCommentDash, ptpeNone, nil, ptpsComment);
	TTSetOne('-', ptpsCommentDash, ptpeNone, nil, ptpsCommentDashes);

	// ptpsCommentDashes:
	TTSetAll(     ptpsCommentDashes, ptpeNone, nil, ptpsComment);
	TTSetOne('>', ptpsCommentDashes, ptpeNone, nil, ptpsCommentEnd);

	// ptpsCommentEnd:
	TTSetAllPrintable(ptpsCommentEnd, ptpeNone, DoCharacter, ptpsCharacter);
	TTSetOne('<', ptpsCommentEnd, ptpeNone, DoBeginTag, ptpsTagStartBracket);
	
	// TODO
	
end;





procedure TPushTagParser.TTSetAll(iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
var
	c: char;
begin
	for c := #0 to #255 do
	begin
		TransitionTable[iState, c].Error    := iError;
		TransitionTable[iState, c].Action   := iAction;
		TransitionTable[iState, c].NewState := iNewState;
	end;
end;





procedure TPushTagParser.TTSetWhitespace(iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
begin
	TransitionTable[iState, #32].Error    := iError;
	TransitionTable[iState, #32].Action   := iAction;
	TransitionTable[iState, #32].NewState := iNewState;
	TransitionTable[iState, #9].Error    := iError;
	TransitionTable[iState, #9].Action   := iAction;
	TransitionTable[iState, #9].NewState := iNewState;
	TransitionTable[iState, #13].Error    := iError;
	TransitionTable[iState, #13].Action   := iAction;
	TransitionTable[iState, #13].NewState := iNewState;
	TransitionTable[iState, #10].Error    := iError;
	TransitionTable[iState, #10].Action   := iAction;
	TransitionTable[iState, #10].NewState := iNewState;
end;





procedure TPushTagParser.TTSetLetters(iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
var
	c: char;
begin
	for c := 'a' to 'z' do
	begin
		TransitionTable[iState, c].Error    := iError;
		TransitionTable[iState, c].Action   := iAction;
		TransitionTable[iState, c].NewState := iNewState;
	end;
	for c := 'A' to 'Z' do
	begin
		TransitionTable[iState, c].Error    := iError;
		TransitionTable[iState, c].Action   := iAction;
		TransitionTable[iState, c].NewState := iNewState;
	end;
end;





procedure TPushTagParser.TTSetNumbers(iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
var
	c: char;
begin
	for c := '0' to '9' do
	begin
		TransitionTable[iState, c].Error    := iError;
		TransitionTable[iState, c].Action   := iAction;
		TransitionTable[iState, c].NewState := iNewState;
	end;
end;





procedure TPushTagParser.TTSetString(iString: string; iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
var
	i: integer;
begin
	for i := 1 to Length(iString) do
	begin
		TransitionTable[iState, iString[i]].Error    := iError;
		TransitionTable[iState, iString[i]].Action   := iAction;
		TransitionTable[iState, iString[i]].NewState := iNewState;
	end;
end;





procedure TPushTagParser.TTSetOne(iChar: char; iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
begin
	TransitionTable[iState, iChar].Error    := iError;
	TransitionTable[iState, iChar].Action   := iAction;
	TransitionTable[iState, iChar].NewState := iNewState;
end;





procedure TPushTagParser.TTSetAllPrintable(iState: TPTPState; iError: TPTPError; iAction: TPTPAction; iNewState: TPTPState);
var
	c: char;
begin
	for c := #33 to #255 do
	begin
		TransitionTable[iState, c].Error    := iError;
		TransitionTable[iState, c].Action   := iAction;
		TransitionTable[iState, c].NewState := iNewState;
	end;
end;





procedure TPushTagParser.Push(Data: PChar; DataSize: integer);
var
	i: integer;
begin
	// The main loop:
	for i := 0 to DataSize - 1 do
	begin
		with TransitionTable[State, Data[i]] do
		begin
			LastError := Error;
			if Assigned(Action) then
			begin
				Action(Data[i]);
			end;
			State := NewState;
		end;
		if (Data[i] = #13) then
		begin
			CurrentRow := CurrentRow + 1;
			CurrentCol := 0;
		end
		else
		begin
			CurrentCol := CurrentCol + 1;
		end;
	end;
end;





procedure TPushTagParser.DoError(iChar: char);
begin
	if Assigned(OnError) then
	begin
		OnError(Self, LastError, iChar, CurrentRow, CurrentCol);
	end;
end;





procedure TPushTagParser.DoBeginTag(iChar: char);
begin
	CurrentTag := '';
end;





procedure TPushTagParser.DoTagName(iChar: char);
begin
	CurrentTag := CurrentTag + iChar;
end;





procedure TPushTagParser.DoTagClose(iChar: char);
begin
	CurrentAttr := Trim(CurrentAttr);
	if (CurrentAttr <> '') then
	begin
		CurrentAttrList.Add(CurrentAttr);
	end;

	if ((CurrentTag <> '') and (CurrentTag[1] = '/')) then
	begin
		if (CurrentAttrList.Count > 0) then
		begin
			LastError := ptpeEndTagAttr;
			DoError(iChar); 
		end;

		if Assigned(OnEndTag) then
		begin
			OnEndTag(Self, Copy(CurrentTag, 2, Length(CurrentTag)));
		end;
		Exit;
	end;

	if Assigned(OnStartTag) then
	begin
		OnStartTag(Self, CurrentTag, CurrentAttrList);
	end;
	CurrentAttrList.Clear();
end;





procedure TPushTagParser.DoEmptyTagClose(iChar: char);
begin
	CurrentAttr := Trim(CurrentAttr);
	if (CurrentAttr <> '') then
	begin
		CurrentAttrList.Add(CurrentAttr);
	end;
	if Assigned(OnStartTag) then
	begin
		OnStartTag(Self, CurrentTag, CurrentAttrList);
		OnEndTag(Self, CurrentTag);
	end;
	CurrentAttrList.Clear();
end;





procedure TPushTagParser.DoAttr(iChar: char);
begin
	CurrentAttr := CurrentAttr + iChar;
end;





procedure TPushTagParser.DoAttrWhitespace(iChar: char);
begin
	CurrentAttr := Trim(CurrentAttr);
	if (CurrentAttr <> '') then
	begin
		CurrentAttrList.Add(CurrentAttr);
	end;
	CurrentAttr := '';
end;





procedure TPushTagParser.DoAttrQuoteEnd(iChar: char);
begin
	CurrentAttr := TrimLeft(CurrentAttr + '"');
	if (CurrentAttr <> '') then
	begin
		CurrentAttrList.Add(CurrentAttr);
	end;
	CurrentAttr := '';
end;





procedure TPushTagParser.DoCharacter(iChar: char);
begin
	if Assigned(OnCharacter) then
	begin
		OnCharacter(Self, iChar);
	end;
end;





procedure TPushTagParser.DoComment1Abandon(iChar: char);
begin
	CurrentTag := '!';
end;





procedure TPushTagParser.DoComment2Abandon(iChar: char);
begin
	CurrentTag := '!-';
end;




end.

