# A parser for SGML, using the derived class as static DTD.

# Taken from http://raa.ruby-lang.org/list.rhtml?name=html-parser-2
# This file seems to be included in the current install of Ruby,
#  but with a bug related to attributes quoted with '"', so I have
#  included it in the HTML package of this distribution.


module HTML

    # Incremental, event-driven tokenizer for SGML/HTML-like markup.
    #
    # Text is pushed in with #feed (any number of times) and flushed with
    # #close.  While scanning, the parser invokes callback methods that a
    # subclass is expected to override:
    #
    # * handle_data(text), handle_comment(text), handle_special(text)
    # * start_TAG(attrs) / end_TAG  - container elements (tracked on a stack)
    # * do_TAG(attrs)               - elements without an end tag
    # * unknown_starttag, unknown_endtag, unknown_charref, unknown_entityref
    #
    # All callbacks defined here are no-ops, so a bare SGMLParser simply
    # consumes its input.
    class SGMLParser

      # Regular expressions used for parsing:
      Interesting = /[&<]/
      Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                  '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                  '![^<>]*)?')

      Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
      Charref = /&#([0-9]+)[^0-9]/

      Starttagopen = /<[>a-zA-Z]/
      Endtagopen = /<\/[<>a-zA-Z]/
      Endbracket = /[<>]/
      Special = /<![^<>]*>/
      Commentopen = /<!--/
      Commentclose = /--[ \t\n]*>/
      Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
      Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                                '(\s*=\s*' +
                                "('[^']*'" +
                                '|"[^"]*"' +
                                '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')
      Endtagfind = /\s*\/\s*>/
      # Named entities that handle_entityref expands by itself.
      Entitydefs =
        {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

      # verbose:: when true, report_unbalanced prints diagnostics.
      def initialize(verbose=false)
        @verbose = verbose
        reset
      end

      # Discards any buffered input and returns the parser to its
      # initial state (empty element stack, no literal mode).
      def reset
        # A fresh mutable buffer: feed appends to it in place with <<.
        @rawdata = ''.dup
        @stack = []
        @lasttag = '???'
        @nomoretags = false
        @literal = false
      end

      # True when an element named +gi+ is currently open.
      def has_context(gi)
        @stack.include? gi
      end

      # From now on everything fed to the parser is passed to handle_data
      # verbatim; no further markup is recognised.
      def setnomoretags
        @nomoretags = true
        @literal = true
      end

      # Enters literal mode: start tags, comments and declarations are
      # delivered as data until the next end tag is parsed.
      # Any arguments are accepted and ignored.
      def setliteral(*args)
        @literal = true
      end

      # Appends +data+ to the input buffer and processes as much of it as
      # can be parsed unambiguously; an incomplete trailing construct is
      # kept for the next call.
      def feed(data)
        @rawdata << data
        goahead(false)
      end

      # Signals end of input: whatever is still buffered is processed and
      # any unparsable remainder is delivered as data.
      def close
        goahead(true)
      end

      # Main scanning loop.  Walks @rawdata from the start, dispatching to
      # the parse_* / handle_* methods, and finally replaces @rawdata with
      # the unconsumed tail.
      #
      # _end:: true when no more input will arrive, in which case the
      #        unconsumed tail is flushed through handle_data.
      def goahead(_end)
        rawdata = @rawdata
        i = 0
        n = rawdata.length
        while i < n
          if @nomoretags
            handle_data(rawdata[i..(n-1)])
            i = n
            break
          end
          # Everything up to the next '&' or '<' is plain character data.
          j = rawdata.index(Interesting, i)
          j = n unless j
          if i < j
            handle_data(rawdata[i..(j-1)])
          end
          i = j
          break if (i == n)
          if rawdata[i] == ?< #
            if rawdata.index(Starttagopen, i) == i
              if @literal
                handle_data(rawdata[i, 1])
                i += 1
                next
              end
              # parse_starttag returns the absolute index after the tag,
              # or nil when the tag is not complete yet.
              k = parse_starttag(i)
              break unless k
              i = k
              next
            end
            if rawdata.index(Endtagopen, i) == i
              k = parse_endtag(i)
              break unless k
              i = k
              # Any end tag terminates literal mode.
              @literal = false
              next
            end
            if rawdata.index(Commentopen, i) == i
              if @literal
                handle_data(rawdata[i,1])
                i += 1
                next
              end
              # parse_comment returns a length, not an absolute index.
              k = parse_comment(i)
              break unless k
              i += k
              next
            end
            if rawdata.index(Special, i) == i
              if @literal
                handle_data(rawdata[i, 1])
                i += 1
                next
              end
              # parse_special returns a length, not an absolute index.
              k = parse_special(i)
              break unless k
              i += k
              next
            end
          elsif rawdata[i] == ?& #
            if rawdata.index(Charref, i) == i
              i += $&.length
              handle_charref($1)
              # The pattern consumes one terminating character; give it
              # back unless it was the ';' that belongs to the reference.
              i -= 1 unless rawdata[i-1] == ?;
              next
            end
            if rawdata.index(Entityref, i) == i
              i += $&.length
              handle_entityref($1)
              i -= 1 unless rawdata[i-1] == ?;
              next
            end
          else
            raise RuntimeError, 'neither < nor & ??'
          end
          # We get here only if incomplete matches but
          # nothing else
          match = rawdata.index(Incomplete, i)
          unless match == i
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          j = match + $&.length
          break if j == n # Really incomplete
          handle_data(rawdata[i..(j-1)])
          i = j
        end
        # end while
        if _end and i < n
          handle_data(@rawdata[i..(n-1)])
          i = n
        end
        @rawdata = rawdata[i..-1]
      end

      # Parses the comment starting at index +i+ (which must point at
      # '<!--') and passes its text to handle_comment.
      #
      # Returns the number of characters consumed, or nil when the
      # closing '-->' has not been seen yet.
      # Raises RuntimeError when +i+ does not point at a comment opener.
      def parse_comment(i)
        rawdata = @rawdata
        if rawdata[i, 4] != '<!--'
          raise RuntimeError, 'unexpected call to handle_comment'
        end
        match = rawdata.index(Commentclose, i)
        return nil unless match
        matched_length = $&.length
        j = match
        handle_comment(rawdata[i+4..(j-1)])
        j = match + matched_length
        return j-i
      end

      # Parses the start tag beginning at index +i+, collects its
      # attributes as [name, value] pairs (names lower-cased, surrounding
      # quotes removed, valueless attributes mapped to '') and hands the
      # result to finish_starttag.
      #
      # Returns the absolute index just past the tag, or nil when the
      # closing bracket has not been seen yet.
      def parse_starttag(i)
        rawdata = @rawdata
        j = rawdata.index(Endbracket, i + 1)
        return nil unless j
        attrs = []
        if rawdata[i+1] == ?> #
          # SGML shorthand: <> == <last open tag seen>
          k = j
          tag = @lasttag
        else
          match = rawdata.index(Tagfind, i + 1)
          unless match
            raise RuntimeError, 'unexpected call to parse_starttag'
          end
          k = i + 1 + ($&.length)
          tag = $&.downcase
          @lasttag = tag
        end
        while k < j
          # The attribute must start exactly at k.  String#index searches
          # forward, so without this check junk inside the tag would let
          # the pattern match text further on (even in a later tag) while
          # k advanced only by the match length.
          break unless rawdata.index(Attrfind, k) == k
          matched_length = $&.length
          break unless k+matched_length<=j
          attrname, rest, attrvalue = $1, $2, $3
          if not rest
            attrvalue = '' # was: = attrname
          elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
              (attrvalue[0] == ?" && attrvalue[-1] == ?")
            attrvalue = attrvalue[1..-2]
          end
          attrs << [attrname.downcase, attrvalue]
          k += matched_length
        end
        if rawdata[j] == ?> #
          j += 1
        end
        finish_starttag(tag, attrs)
        return j
      end

      # Parses the end tag beginning at index +i+ and hands the
      # lower-cased, stripped tag name to finish_endtag.
      #
      # Returns the absolute index just past the tag, or nil when the
      # closing bracket has not been seen yet.
      def parse_endtag(i)
        rawdata = @rawdata
        j = rawdata.index(Endbracket, i + 1)
        return nil unless j
        tag = (rawdata[i+2..j-1].strip).downcase
        if rawdata[j] == ?> #
          j += 1
        end
        finish_endtag(tag)
        return j
      end

      # Dispatches a parsed start tag.
      #
      # Returns 1 when a start_TAG handler exists (the element is pushed
      # on the stack), 0 when only a do_TAG handler exists, and -1 when
      # the tag is unknown (unknown_starttag is called).
      def finish_starttag(tag, attrs)
        method = 'start_' + tag
        if self.respond_to?(method)
          @stack << tag
          handle_starttag(tag, method, attrs)
          return 1
        else
          method = 'do_' + tag
          if self.respond_to?(method)
            handle_starttag(tag, method, attrs)
            return 0
          else
            unknown_starttag(tag, attrs)
            return -1
          end
        end
      end

      # Dispatches a parsed end tag.  An empty +tag+ (from '</>') closes
      # the most recently opened element.  Otherwise the innermost open
      # element with that name is closed, together with everything opened
      # after it.
      def finish_endtag(tag)
        if tag == ''
          found = @stack.length - 1
          if found < 0
            unknown_endtag(tag)
            return
          end
        else
          # rindex: close the innermost matching element, not the
          # outermost one.
          found = @stack.rindex(tag)
          unless found
            if self.respond_to?('end_' + tag)
              # Known element, but it is not open.
              report_unbalanced(tag)
            else
              unknown_endtag(tag)
            end
            return
          end
        end
        while @stack.length > found
          tag = @stack[-1]
          method = 'end_' + tag
          if respond_to?(method)
            handle_endtag(tag, method)
          else
            unknown_endtag(tag)
          end
          @stack.pop
        end
      end

      # Parses the '<!...>' declaration starting at index +i+ and passes
      # the text between '<' and the closing bracket to handle_special.
      #
      # Returns the number of characters consumed, or nil when no closing
      # bracket is available yet.
      def parse_special(i)
        rawdata = @rawdata
        match = rawdata.index(Endbracket, i+1)
        return nil unless match
        matched_length = $&.length
        handle_special(rawdata[i+1..(match-1)])
        return match - i + matched_length
      end

      # Invokes the start_TAG / do_TAG handler named by +method+.
      def handle_starttag(tag, method, attrs)
        self.send(method, attrs)
      end

      # Invokes the end_TAG handler named by +method+.
      def handle_endtag(tag, method)
        self.send(method)
      end

      # Called for an end tag whose element is known but not open.
      # Prints a diagnostic when the parser was created verbose.
      def report_unbalanced(tag)
        if @verbose
          print '*** Unbalanced </' + tag + '>', "\n"
          print '*** Stack: ' + @stack.inspect, "\n"
        end
      end

      # Handles a numeric character reference; +name+ is the digit string
      # between '&#' and the terminator.  Values 0..255 are delivered to
      # handle_data as a one-character string, anything else goes to
      # unknown_charref.
      # Raises ArgumentError when +name+ is not a decimal number.
      def handle_charref(name)
        # Explicit base 10: a leading zero must not switch to octal
        # ("&#065;" is 'A', and "&#08;" is valid).
        n = Integer(name, 10)
        if !(0 <= n && n <= 255)
          unknown_charref(name)
          return
        end
        handle_data(n.chr)
      end

      # Handles a named entity reference: names listed in Entitydefs are
      # expanded and delivered to handle_data, others go to
      # unknown_entityref.
      def handle_entityref(name)
        table = Entitydefs
        if table.include?(name)
          handle_data(table[name])
        else
          unknown_entityref(name)
          return
        end
      end

      # Callback for character data.  No-op; override in a subclass.
      def handle_data(data)
      end

      # Callback for comment text.  No-op; override in a subclass.
      def handle_comment(data)
      end

      # Callback for '<!...>' declarations.  No-op; override in a subclass.
      def handle_special(data)
      end

      # Callbacks for constructs without a dedicated handler.
      # No-ops; override in a subclass.
      def unknown_starttag(tag, attrs)
      end
      def unknown_endtag(tag)
      end
      def unknown_charref(ref)
      end
      def unknown_entityref(ref)
      end

    end
end
