require 'hpricot/htmlinfo' def Hpricot(input = nil, opts = {}, &blk) Hpricot.parse(input, opts, &blk) end module Hpricot # Exception class used for any errors related to deficiencies in the system when # handling the character encodings of a document. class EncodingError < StandardError; end # Hpricot.parse parses input and return a document tree. # represented by Hpricot::Doc. def Hpricot.parse(input = nil, opts = {}, &blk) Doc.new(make(input, opts, &blk)) end # Hpricot::XML parses input, disregarding all the HTML rules # and returning a document tree. def Hpricot.XML(input, opts = {}) Doc.new(make(input, opts.merge(:xml => true))) end # :stopdoc: def Hpricot.make(input = nil, opts = {}, &blk) opts = {:fixup_tags => false}.merge(opts) unless input or blk raise ArgumentError, "An Hpricot document must be built from an input source (a String) or a block." end conv = opts[:xml] ? :to_s : :downcase fragment = if input case opts[:encoding] when nil when 'utf-8' unless defined? Encoding::Character::UTF8 raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode." end else raise EncodingError, "No encoding option `#{opts[:encoding]}' is available." end if opts[:xhtml_strict] opts[:fixup_tags] = true end stack = [[nil, nil, [], [], [], []]] Hpricot.scan(input) do |token| if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and !(token[0] == :etag and token[1].casecmp(stack.last[0]).zero?) token[0] = :text token[1] = token[3] if token[3] end if !opts[:xml] and token[0] == :emptytag token[1] = token[1].send(conv) if ElementContent[token[1].downcase] != :EMPTY token[0] = :stag end end # TODO: downcase instead when parsing attributes? if !opts[:xml] and token[2].is_a?(Hash) token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh } end case token[0] when :stag case opts[:encoding] when 'utf-8' token.map! { |str| u(str) if str.is_a? String } end stagname = token[0] = token[1] = token[1].send(conv) if ElementContent[stagname] == :EMPTY and !opts[:xml] token[0] = :emptytag stack.last[2] << token else unless opts[:xml] if opts[:fixup_tags] # obey the tag rules set up by the current element if ElementContent.has_key? stagname trans = nil (stack.length-1).downto(0) do |i| untags = stack[i][5] break unless untags.include? stagname # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}" trans = i end if trans.to_i > 1 eles = stack.slice!(trans..-1) stack.last[2] += eles # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}" end elsif opts[:xhtml_strict] token[2] = {'class' => stagname} stagname = token[0] = "div" end end # setup tag rules for inside this element if ElementContent[stagname] == :CDATA uncontainable_tags = :CDATA elsif opts[:fixup_tags] possible_tags = ElementContent[stagname] excluded_tags, included_tags = stack.last[3..4] if possible_tags excluded_tags = excluded_tags | (ElementExclusions[stagname] || []) included_tags = included_tags | (ElementInclusions[stagname] || []) containable_tags = (possible_tags | included_tags) - excluded_tags uncontainable_tags = ElementContent.keys - containable_tags else # If the tagname is unknown, it is assumed that any element # except excluded can be contained. uncontainable_tags = excluded_tags end end end unless opts[:xml] case token[2] when Hash token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh } end end stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags] end when :etag etagname = token[0] = token[1].send(conv) if opts[:xhtml_strict] and not ElementContent.has_key? etagname etagname = token[0] = "div" end matched_elem = nil (stack.length-1).downto(0) do |i| stagname, = stack[i] if stagname == etagname matched_elem = stack[i] stack[i][1] += token eles = stack.slice!((i+1)..-1) stack.last[2] += eles break end end unless matched_elem stack.last[2] << [:bogus_etag, token.first, token.last] else ele = stack.pop stack.last[2] << ele end when :text l = stack.last[2].last if l and l[0] == :text l[1] += token[1] else stack.last[2] << token end else stack.last[2] << token end end while 1 < stack.length ele = stack.pop stack.last[2] << ele end structure_list = stack[0][2] structure_list.map {|s| build_node(s, opts) } elsif blk Hpricot.build(&blk).children end end def Hpricot.build_node(structure, opts = {}) case structure[0] when String tagname, _, attrs, sraw, _, _, _, eraw = structure[1] children = structure[2] etag = eraw && ETag.parse(tagname, eraw) stag = STag.parse(tagname, attrs, sraw, true) if !children.empty? || etag Elem.new(stag, children.map {|c| build_node(c, opts) }, etag) else Elem.new(stag) end when :text Text.parse_pcdata(structure[1]) when :emptytag Elem.new(STag.parse(structure[1], structure[2], structure[3], false)) when :bogus_etag BogusETag.parse(structure[1], structure[2]) when :xmldecl XMLDecl.parse(structure[2], structure[3]) when :doctype if opts[:xhtml_strict] structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN" end DocType.parse(structure[1], structure[2], structure[3]) when :procins ProcIns.parse(structure[1]) when :comment Comment.parse(structure[1]) when :cdata_content Text.parse_cdata_content(structure[1]) when :cdata Text.parse_cdata_section(structure[1]) else raise Exception, "[bug] unknown structure: #{structure.inspect}" end end def STag.parse(qname, attrs, raw_string, is_stag) result = STag.new(qname, attrs) result.raw_string = raw_string result end def ETag.parse(qname, raw_string) result = self.new(qname) result.raw_string = raw_string result end def BogusETag.parse(qname, raw_string) result = self.new(qname) result.raw_string = raw_string result end def Text.parse_pcdata(raw_string) result = Text.new(raw_string) result end def Text.parse_cdata_content(raw_string) result = CData.new(raw_string) result end def Text.parse_cdata_section(content) result = CData.new(content) result end def XMLDecl.parse(attrs, raw_string) attrs ||= {} version = attrs['version'] encoding = attrs['encoding'] case attrs['standalone'] when 'yes' standalone = true when 'no' standalone = false else standalone = nil end result = XMLDecl.new(version, encoding, standalone) result.raw_string = raw_string result end def DocType.parse(root_element_name, attrs, raw_string) if attrs public_identifier = attrs['public_id'] system_identifier = attrs['system_id'] end root_element_name = root_element_name.downcase result = DocType.new(root_element_name, public_identifier, system_identifier) result.raw_string = raw_string result end def ProcIns.parse(raw_string) _, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m) result = ProcIns.new(target, content) result end def Comment.parse(content) result = Comment.new(content) result end module Pat NameChar = /[-A-Za-z0-9._:]/ Name = /[A-Za-z_:]#{NameChar}*/ Nmtoken = /#{NameChar}+/ end # :startdoc: end