Unit RichTextDocumentUnit;

{$mode objfpc}{$H+}
// Declarations of tags, and parsing functions

Interface

uses
  Classes
  ,fpg_base
  ;

type
  COUNTRYCODE = string[2];

  TTagType = ( ttInvalid,
               ttBold, ttBoldOff,
               ttItalic, ttItalicOff,
               ttUnderline, ttUnderlineOff,
               ttFixedWidthOn, ttFixedWidthOff,
               ttHeading1, ttHeading2, ttHeading3,
               ttHeading1Off, ttHeading2Off, ttHeading3Off,
               ttColor, ttColorOff,
               ttBackgroundColor, ttBackgroundColorOff,
               ttRed, ttBlue, ttGreen, ttBlack,
               ttRedOff, ttBlueOff, ttGreenOff, ttBlackOff,
               ttNoWrap,ttNoWrapOff,
               ttAlign,
               ttBeginLink, ttEndLink,
               ttSetLeftMargin, ttSetRightMargin,
               ttImage,
               ttFont, ttFontOff,
               ttEnd );

  TStandardColor = record
    Name: string[ 32 ];
    Color: TfpgColor;
  end;

  TTag = record
    TagType: TTagType;
    Arguments: string;
  end;

  TTextElementType = ( teText, // a character
                       teWordBreak,
                       teLineBreak, // end of para
                       teTextEnd,
                       teImage,
                       teStyle );

  TTextElement = record
    ElementType: TTextElementType;
    Character: TfpgChar;  // so we can support UTF-8 character
    Tag: TTag;
  end;

  TTextAlignment = ( taLeft,
                     taRight,
                     taCenter );

const
  TagStr: array[ ttInvalid .. ttEnd ] of string =
  (
    '', //
    'b',
    '/b',
    'i',
    '/i',
    'u',
    '/u',
    'tt',
    '/tt',
    'h1',
    'h2',
    'h3',
    '/h1',
    '/h2',
    '/h3',
    'color',
    '/color',
    'backcolor',
    '/backcolor',
    'red',
    'blue',
    'green',
    'black',
    '/red',
    '/blue',
    '/green',
    '/black',
    'nowrap',
    '/nowrap',
    'align',
    'link',
    '/link',
    'leftmargin',
    'rightmargin',
    'image',
    'font',
    '/font',
    ''
  );


// Returns tag pointed to by TextPointer and
// moves TextPointer to the first char after the tag.
Function ExtractTag( Var TextPointer: PChar ): TTag;

// Returns tag ending at TextPointer
// (Expects textpointer is currently pointing at the >)
// and moves TextPointer to the first char of the tag
Function ExtractPreviousTag( const TextStart: PChar; Var TextPointer: PChar ): TTag;
function ExtractNextTextElement( TextPointer: PChar; Var NextElement: PChar ): TTextElement;
function ExtractPreviousTextElement( const TextStart: PChar; TextPointer: PChar; Var NextElement: PChar ): TTextElement;

// Parse a color name or value (#hexval). Returns true if valid
function GetTagColor( const ColorParam: string; var Color: TfpgColor ): boolean;

function GetTagTextAlignment( const AlignParam: string;
                              const Default: TTextAlignment ): TTextAlignment;

function GetTagTextWrap( const WrapParam: string ): boolean;

// Search within a rich text document for the given text
// if found, returns true, pMatch is set to the first match,
//   and MatchLength returns the length of the match
//   (which may be greater than the length of Text due to
//    to skipping tags)
// if not found, returns false, pMatch is set to nil
function RichTextFindString( pRichText: PChar;
                             const Text: string;
                             var pMatch: PChar;
                             var MatchLength: longint ): boolean;

// Returns the start of the previous word,
// or the current word if pStart is in the middle of the word
function RichTextWordLeft( pRichText: PChar;
                           pStart: PChar ): PChar;

// Returns the start of the next word.
function RichTextWordRight( pStart: PChar ): PChar;

// If pStart is in the middle of a word, then
// returns true and sets the start and length of the word
function RichTextWordAt( pRichText: PChar;
                         pStart: PChar;
                         Var pWordStart: PChar;
                         Var WordLength: longint ): boolean;

// Copies plaintext of richtext starting at StartP
// to the given buffer. Returns number of characters copied.
// Buffer may be nil
// If BufferLength is negative, it is effectively ignored
function CopyPlainTextToBuffer( StartP: PChar;
                                EndP: PChar;
                                Buffer: PChar;
                                BufferLength: longint ): longint;

Implementation

uses
  SysUtils
  ,ACLStringUtility
  ,fpg_stringutils
  ;

const
  StandardColors: array[ 0..7 ] of TStandardColor =
  (
    ( Name : 'white' ; Color: clWhite   ),
    ( Name : 'black' ; Color: clBlack   ),
    ( Name : 'red'   ; Color: clRed     ),
    ( Name : 'blue'  ; Color: clBlue    ),
    ( Name : 'green' ; Color: clLime    ),
    ( Name : 'purple'; Color: clFuchsia ),
    ( Name : 'yellow'; Color: clYellow  ),
    ( Name : 'cyan'  ; Color: clAqua    )
  );

Procedure ParseTag( const Text: string;
                    Var Tag: TTag );
var
  TagType: TTagType;
  TagTypeText: string;
  SpacePos: longint;
begin
  SpacePos := Pos( ' ', Text );
  if SpacePos <> 0 then
  begin
    Tag.Arguments := trim( Copy( Text, SpacePos + 1, 255 ) );
    TagTypeText := LowerCase( Copy( Text, 1, SpacePos - 1 ) );
  end
  else
  begin
    Tag.Arguments := ''; // to save time copying when not needed
    TagTypeText := LowerCase( Text );
  end;

  for TagType := ttBold to ttEnd do
  begin
    if TagStr[ TagType ] = TagTypeText  then
    begin
      Tag.TagType := TagType;
      exit;
    end;
  end;

  // not found
  Tag.TagType := ttInvalid;
end;

var
  TagText: string;
  TagArgText: string;

Function ExtractTag( Var TextPointer: PChar ): TTag;
var
  CurrentChar: Char;
  TagTooLong: boolean;
  InQuote: boolean;
begin
//  assert( TextPointer[ 0 ] = '<' );
  TagText := '';
  TagTooLong := false;
  InQuote := false;

  repeat
    CurrentChar := TextPointer^;

    if     ( CurrentChar = '>' )
       and ( not InQuote ) then
    begin
      // found tag end.
      if TagTooLong then
        Result.TagType := ttInvalid
      else
        ParseTag( TagText, Result );
      inc( TextPointer );
      exit;
    end;

    if CurrentChar = #0 then
    begin
      // if we reach here we have reached the end of text
      // during a tag. invalid tag.
      Result.TagType := ttInvalid;
      exit;
    end;

    if CurrentChar = CharDoubleQuote then
    begin
      if not InQuote then
      begin
        InQuote := true
      end
      else
      begin
        // Could be escaped quote ""
//        if (TextPointer + 1 )^ = DoubleQuote then
        if ( TextPointer + 1 ) ^ = CharDoubleQuote then
        begin
          // yes it is
          inc( TextPointer ); // skip second one
        end
        else
        begin
          // no, not an escaped quote
          InQuote := false;
        end;
      end;

    end;

    if not TagTooLong then
      if Length( TagText ) < 200 then
        TagText := TagText + CurrentChar
      else
        TagTooLong := true; // but keep going until the end

    inc( TextPointer );
  until false;

end;

// Expects textpointer is currently pointing at the >
Function ExtractPreviousTag( const TextStart: PChar;
                             Var TextPointer: PChar ): TTag;
var
  CurrentChar: Char;
  TagTooLong: boolean;
  InQuote: boolean;
begin
  TagText := '';
  TagTooLong := false;
  InQuote := false;

  repeat
    dec( TextPointer );
    if TextPointer < TextStart then
    begin
      // if we reach here we have reached the end of text
      // during a tag. invalid tag.
      Result.TagType := ttInvalid;
      exit;
    end;
    CurrentChar := TextPointer^;

    if     ( CurrentChar = '<' )
       and ( not InQuote ) then
    begin
      // found tag end.
      if TagTooLong then
        Result.TagType := ttInvalid
      else
        ParseTag( TagText, Result );
      exit;
    end;

    if CurrentChar = CharDoubleQuote then
    begin
      if not InQuote then
      begin
        InQuote := true
      end
      else
      begin
        // Could be escaped quote ""
        if TextPointer <= TextStart then
        begin
          // start of text... somethin weird
          InQuote := false;
        end
        else if ( TextPointer - 1 ) ^ = CharDoubleQuote then
        begin
          // yes it is
          dec( TextPointer ); // skip second one
        end
        else
        begin
          // no, not an escaped quote
          InQuote := false;
        end;
      end;

    end;

    if not TagTooLong then
      if Length( TagText ) < 200 then
        TagText := CurrentChar + TagText
      else
        TagTooLong := true; // but keep going until the end

  until false;

end;

function ExtractNextTextElement( TextPointer: PChar; Var NextElement: PChar ): TTextElement;
var
  TheChar: TfpgChar;
  NextChar: TfpgChar;
  CharLength: integer;
  s1, s2: TfpgString;
  c: Char;
begin
  with Result do
  begin
    CharLength := UTF8CharacterLength(TextPointer);
    s1 := TextPointer;
    TheChar := UTF8Copy(s1, 1, 1);
    c := TextPointer^;
    Character := TheChar;
    inc( TextPointer, CharLength );

    if (TheChar = ' ') and (c <> #0) then
    begin
      // ---- Space (word break) found ----
      ElementType := teWordBreak;
    end
    else if (TheChar = #10) or (TheChar = #13) then
    begin
      // ---- End of line found ----
      begin
        ElementType := teLineBreak;
        if TheChar = #13 then
        begin
          TheChar := TextPointer^;
          if TheChar = #10 then
            // skip CR following LF
            inc( TextPointer );
        end;
      end;
    end
    else if (TheChar = '') and (c = #0) then
    begin
      // ---- end of text found ----
      ElementType := teTextEnd;
    end
    else if TheChar = '<' then
    begin
      // ---- tag found? ----
      CharLength := UTF8CharacterLength(TextPointer);
      s2 := TextPointer;
      NextChar := UTF8Copy(s2, 1, 1);
      if NextChar = '<' then
      begin
        // no, just a literal <
        ElementType := teText;
        inc( TextPointer );
      end
      else
      begin
        Tag := ExtractTag( TextPointer );
        if Tag.TagType = ttImage then
          ElementType := teImage
        else
          ElementType := teStyle;
      end;
    end
    else if TheChar = '>' then
    begin
      // check - should be double
      begin
        ElementType := teText;
        NextChar := TextPointer^;
        if NextChar = '>' then
          inc( TextPointer );
      end;
    end
    else
      ElementType := teText;

  end; // with
  NextElement := TextPointer;
end;

function ExtractPreviousTextElement( const TextStart: PChar;
                                     TextPointer: PChar;
                                     Var NextElement: PChar ): TTextElement;
var
  TheChar: Char;
  PreviousChar: Char;
  FoundTag: boolean;
begin
  with Result do
  begin
    dec( TextPointer );
    TheChar := TextPointer^;
    Character := TheChar;
    if TextPointer < TextStart then
    begin
      ElementType := teTextEnd;
      exit;
    end;

    case TheChar of
      ' ': // ---- Space (word break) found ----
        ElementType := teWordBreak;

      #10, #13: // ---- End of line found ----
      begin
        ElementType := teLineBreak;
        if TheChar = #10 then
        begin
          dec( TextPointer );
          TheChar := TextPointer^;
          if TheChar = #13 then
          begin
            // skip CR preceeding LF
          end
          else
            inc( TextPointer );
        end;
      end;

      '>': // ---- tag found ----
      begin
        FoundTag := true;
        if TextPointer > TextStart then
        begin
          PreviousChar := ( TextPointer - 1 )^;
          if PreviousChar = '>' then
          begin
            // no. just a literal >
            FoundTag := false;
            ElementType := teText;
            dec( TextPointer );
          end
        end;

        if FoundTag then
        begin
          Tag := ExtractPreviousTag( TextStart, TextPointer );
          if Tag.TagType = ttImage then
            ElementType := teImage
          else
            ElementType := teStyle;
        end;
      end;

      '<': // should be double
      begin
        ElementType := teText;
        if TextPointer > TextStart then
        begin
          PreviousChar := TextPointer^;
          if PreviousChar = '<' then
            dec( TextPointer );
        end;
      end
      else
        ElementType := teText;
    end;
  end; // with
  NextElement := TextPointer;
end;

function GetTagColor( const ColorParam: string;
                      var Color: TfpgColor ): boolean;
var
  ColorIndex: longint;
begin
  Result := false;
  if ColorParam <> '' then
  begin
    if ColorParam[ 1 ] = '#' then
    begin
      try
        Color := HexToInt( StrRightFrom( ColorParam, 2 ) );
        Result := true;
      except
      end;
    end
    else
    begin
      for ColorIndex := 0 to High( StandardColors ) do
      begin
        if StringsSame( ColorParam, StandardColors[ ColorIndex ].Name ) then
        begin
          Color := StandardColors[ ColorIndex ].Color;
          Result := true;
          break;
        end;
      end;
    end;
  end;
end;

function GetTagTextAlignment( const AlignParam: string;
                              const Default: TTextAlignment ): TTextAlignment;
begin
  if StringsSame( AlignParam, 'left' ) then
    Result := taLeft
  else if StringsSame( AlignParam, 'center' ) then
    Result := taCenter
  else if StringsSame( AlignParam, 'right' ) then
    Result := taRight
  else
    Result := Default;
end;

function GetTagTextWrap( const WrapParam: string ): boolean;
begin
  Result := StringsSame( WrapParam, 'yes' );
end;

function RichTextFindString( pRichText: PChar;
                             const Text: string;
                             var pMatch: PChar;
                             var MatchLength: longint ): boolean;
var
  P: PChar;
  NextP: PChar;
  Element: TTextElement;
  pMatchStart: pchar;
  pMatchStartNext: pchar;
  MatchIndex: longint;
  C: Char;
begin
  if Length( Text ) = 0 then
  begin
    // null string always matches
    Result := true;
    pMatch := pRichText;
    MatchLength := 0;
    exit;
  end;

  P := pRichText;
  MatchIndex := 1;

  // Now search, case insensitively
  while true do
  begin
    Element := ExtractNextTextElement( P, NextP );

    case Element.ElementType of
      teTextEnd:
        // end of text
        break;

      teImage,
      teLineBreak:
        // breaks a potential match
        MatchIndex := 1;

      teStyle:
        ; // ignore, matches can continue

      else
      begin
        if Uppercase(Element.Character) = UpperCase(Text[Matchindex]) then
        begin
          // found a match
          if MatchIndex = 1 then
          begin
            pMatchStart := P; // store start of match
            pMatchStartNext := NextP;
          end;

          inc( MatchIndex );
          if MatchIndex > Length( Text ) then
          begin
            // found a complete match
            Result := true;
            pMatch := pMatchStart;
            MatchLength := PCharDiff( P, pMatchStart )
                           + 1; // include this char
            exit;
          end;
        end
        else
        begin
          // not a match
          if MatchIndex > 1 then
          begin
            // go back to start of match, + 1
            NextP := pMatchStartNext;
            MatchIndex := 1;
          end;
        end;
      end;
    end;

    P := NextP;
  end;

  // no match found
  Result := false;
  pMatch := nil;
  MatchLength := 0;
end;

function RichTextWordLeft( pRichText: PChar;
                           pStart: PChar ): PChar;
Var
  P: PChar;
  NextP: PChar;
  Element: TTextElement;
begin
  P := pStart;

  // skip whitespace/tags...
  Element := ExtractPreviousTextElement( pRichText, P, NextP );
  P := NextP;
  while Element.ElementType in [ teWordBreak, teLineBreak, teImage, teStyle ] do
  begin
    Element := ExtractPreviousTextElement( pRichText, P, NextP );
    P := NextP;
  end;
  if Element.ElementType = teTextEnd then
  begin
    Result := P;
    // out of text
    exit;
  end;

  // back to start of word, skip text/tags
  while true do
  begin
    Element := ExtractPreviousTextElement( pRichText, P, NextP );
    if not ( Element.ElementType in [ teText, teStyle ] ) then
      break;
    P := NextP;
  end;
  Result := P;
end;

function RichTextWordRight( pStart: PChar ): PChar;
Var
  P: PChar;
  NextP: PChar;
  Element: TTextElement;
begin
  P := pStart;

  // skip text/tags...
  Element := ExtractNextTextElement( P, NextP );
  while Element.ElementType in [ teStyle, teText ] do
  begin
    P := NextP;
    Element := ExtractNextTextElement( P, NextP );
  end;
  if Element.ElementType <> teTextEnd then
  begin
    // skip whitespace
    Element := ExtractNextTextElement( P, NextP );
    while Element.ElementType in [ teWordBreak, teLineBreak, teImage, teStyle ] do
    begin
      P := NextP;
      Element := ExtractNextTextElement( P, NextP );
    end;
  end;

  Result := P;
end;

function RichTextWordAt( pRichText: PChar;
                         pStart: PChar;
                         Var pWordStart: PChar;
                         Var WordLength: longint ): boolean;
Var
  P: PChar;
  NextP: PChar;
  Element: TTextElement;
  pWordEnd: PChar;
begin
  P := pStart;
  Element := ExtractNextTextElement( P, NextP );
  if not ( Element.ElementType in [ teStyle, teText ] ) then
  begin
    // not in a word.
    result := false;
    pWordStart := nil;
    WordLength := 0;
    exit;
  end;
  // find end of the word
  while Element.ElementType in [ teStyle, teText ] do
  begin
    P := NextP;
    Element := ExtractNextTextElement( P, NextP );
  end;
  pWordEnd := P;

  P := pStart;
  Element := ExtractPreviousTextElement( pRichText, P, NextP );
  while Element.ElementType in [ teStyle, teText ] do
  begin
    P := NextP;
    Element := ExtractPreviousTextElement( pRichText, P, NextP );
  end;
  pWordStart := P;
  WordLength := PCharDiff( pWordEnd, pWordStart );
  Result := true;
end;

// TODO: Rewrite this to work with fpGUI and AnsiString/TfpgString
function CopyPlainTextToBuffer( StartP: PChar;
                                EndP: PChar;
                                Buffer: PChar;
                                BufferLength: longint ): longint;
var
  Q: PChar;
  EndQ: Pchar;
  P: PChar;
  NextP: PChar;
  Element: TTextElement;
begin
 (*
  P := StartP;
  Q := Buffer;
  EndQ := Buffer + BufferLength;

  while P < EndP do
  begin
    Element := ExtractNextTextElement( P, NextP );
    case Element.ElementType of
      teText, teWordBreak:
      begin
        // copy char
        if Buffer <> nil then
          Q[ 0 ] := Element.Character;
        inc( Q );
      end;

      teLineBreak:
      begin
        if Buffer <> nil then
          Q[ 0 ] := #13;
        inc( Q );
        if Q = EndQ then
          // end of buffer
          break;

        if Buffer <> nil then
          Q[ 0 ] := #10;
        inc( Q );
      end;
    end;

   if Q = EndQ then
     // end of buffer
     break;

    P := NextP;
  end;
  result := PCharDiff( Q, Buffer );
  *)
end;

Initialization
End.