Class String
In: lib/feedparser/text-output.rb
lib/feedparser/textconverters.rb
Parent: Object
String\n[lib/feedparser/text-output.rb\nlib/feedparser/textconverters.rb] dot/f_8.png

This class provides various converters

Methods

Constants

MY_ENTITIES = {}

Public Instance methods

[Source]

    # File lib/feedparser/textconverters.rb, line 17
# Escape the characters that are special in HTML text.
#
# Returns a new String in which '&', '<' and '>' are replaced by the
# entities '&amp;', '&lt;' and '&gt;'. The receiver is not modified.
def escape_html
  # The ampersand must be handled first; otherwise the '&' introduced by
  # the '&lt;' / '&gt;' replacements would be escaped a second time.
  gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
end

Returns a true value (the offset of the match) if the text contains escaped HTML, i.e. HTML markup written with entities; returns nil otherwise. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 13
# Does this text contain entity-escaped HTML markup?
#
# Probes, in this order, for an escaped <img src=, <a href=, <br> (also
# <br/> and <br />) and <p>, all case-insensitively. Returns the character
# offset of the first probe that matches, or nil when none does. Note that
# the offset belongs to the first *probe* that hits, not necessarily to the
# earliest escaped tag in the text.
def escaped_html?
  probes = [
    /&lt;img src=/i,
    /&lt;a href=/i,
    /&lt;br(\/| \/|)&gt;/i,
    /&lt;p&gt;/i
  ]
  probes.each do |probe|
    offset = self =~ probe
    return offset if offset
  end
  nil
end

Convert an HTML text to plain text

[Source]

    # File lib/feedparser/text-output.rb, line 7
 # Convert this HTML text to plain text.
 #
 # A copy of the receiver is fed through FeedParser::HTML2TextParser and the
 # parser's saved data is then tidied up: leading/trailing whitespace is
 # removed, spaces around newlines are dropped, runs of blank lines collapse
 # to a single blank line, and runs of spaces/tabs collapse to one space.
 #
 # wrapto - false (the default) for no wrapping; otherwise the value is
 #          handed to wrap_text as the wrapping width.
 #
 # Returns the resulting plain text.
 def html2text(wrapto = false)
   parser = FeedParser::HTML2TextParser::new(true)
   parser.feed(clone)
   parser.close
   plain = parser.savedata

   cleanups = [
     [/\A\s*/m, ''],       # leading whitespace
     [/\s*\Z/m, ''],       # trailing whitespace
     [/ *\n/m, "\n"],      # spaces before a newline
     [/\n */m, "\n"],      # spaces after a newline
     [/\n\n+/m, "\n\n"],   # more than one blank line in a row
     [/[ \t]+/, ' ']       # repeated spaces and tabs
   ]
   cleanups.each { |pattern, replacement| plain.gsub!(pattern, replacement) }

   # finally, wrap the text if requested
   wrapto ? wrap_text(plain, wrapto) : plain
 end

Is this text HTML? Searches for a few common tags and returns the offset of the match, or nil if none is found. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 8
 # Does this text look like real (unescaped) HTML?
 #
 # Probes, in this order and case-insensitively, for <p>, </p>, <br>,
 # <br/> (with optional whitespace), </a> and <img ...>. Returns the
 # character offset of the first probe that matches, or nil when none does.
 # The offset belongs to the first *probe* that hits, which is not
 # necessarily the earliest tag in the text.
 def html?
   probes = [
     /<p>/i,
     /<\/p>/i,
     /<br>/i,
     /<br\s*(\/)?\s*>/i,
     /<\/a>/i,
     /<img.*>/i
   ]
   probes.each do |probe|
     offset = self =~ probe
     return offset if offset
   end
   nil
 end

Remove leading and trailing whitespace from the text, modifying the string in place.

[Source]

    # File lib/feedparser/textconverters.rb, line 95
# Strip leading and trailing whitespace from the receiver, in place.
#
# Returns the receiver. Both patterns can match the empty string, so each
# gsub! always finds a match and therefore returns the string rather than
# nil.
def rmWhiteSpace!
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m,'')
end

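Convert text to HTML. Real HTML is kept as is, escaped HTML is un-escaped, and plain text is wrapped in paragraphs with its URIs turned into links. If feed has a link, relative src/href attributes are resolved against it.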

[Source]

    # File lib/feedparser/textconverters.rb, line 40
# Convert this text to HTML.
#
# The text is classified with html? and escaped_html?:
# * real HTML is left untouched;
# * escaped HTML is passed through unescape_html;
# * anything else is treated as plain text: it is wrapped in <p>...</p>,
#   blank lines become paragraph breaks, and http/ftp/https URIs not
#   preceded by a quote are turned into links.
#
# feed - object responding to #link (may be nil). When feed and feed.link
#        are both set, every src=/href= attribute value that is not already
#        absolute is rewritten relative to feed.link.
#
# Returns a new String; the receiver is not modified.
def text2html(feed)
  html = clone
  real_at = html.html?
  escaped_at = html.escaped_html?

  # Some RSS feeds contain both real and escaped HTML (crazy!): in that
  # case the kind whose probe matched at the lower offset wins.
  use_real = real_at && (!escaped_at || real_at < escaped_at)
  use_escaped = !use_real && escaped_at

  if use_real
    # already HTML: nothing to do
  elsif use_escaped
    html = html.unescape_html
  else
    # paragraphs
    html.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    html.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    html.gsub!(/([^'"])(#{URI::regexp(['http','ftp','https'])})/,
               '\1<a href="\2">\2</a>')
  end

  # Handle broken (relative) hrefs in <a> and <img>
  if feed && feed.link
    html.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |attribute|
      begin
        # Grab the captures before any further regexp match overwrites them.
        found = Regexp.last_match
        opening, url, closing = found[1], found[3], found[4]

        if url =~ /^\s*\w+:\/\// || url =~ /^\s*\w+:\w/
          # already absolute (scheme://... or scheme:...): keep as is
          attribute
        else
          base =
            if url =~ /^\//
              # host-relative: keep only scheme://host of the feed link
              feed.link.split(/\//)[0..2].join('/')
            else
              pieces = feed.link.split(/\//)
              if pieces.length == 3 # http://toto with no trailing /
                feed.link + '/'
              elsif feed.link =~ /\/$/
                feed.link
              else
                # drop the last path component of the feed link
                pieces[0...-1].join('/') + '/'
              end
            end
          opening + base + url + closing
        end
      rescue
        # on any error leave the attribute unchanged
        attribute
      end
    end
  end

  html
end

Convert text declared to be encoded in inputenc to UTF-8. The declared encoding may be wrong, so text that is already valid UTF-8 is returned unchanged.

[Source]

     # File lib/feedparser/textconverters.rb, line 101
# Convert this text, declared to be in encoding inputenc, to UTF-8.
#
# inputenc - name of the declared encoding; compared case-insensitively
#            with 'utf-8'.
#
# Returns the receiver itself when inputenc is UTF-8, or when the text
# round-trips unchanged through a UTF-8 unpack/pack (i.e. the declared
# encoding was wrong). Otherwise every byte is re-encoded as the Unicode
# code point of the same value. If that fails too, the receiver is
# returned unchanged.
def toUTF8(inputenc)
  return self if inputenc.downcase == 'utf-8'

  # It is said not to be UTF-8. Make sure it REALLY is not UTF-8.
  begin
    return self if unpack('U*').pack('U*') == self
  rescue
    # not decodable as UTF-8: fall through to the byte-wise conversion
  end

  begin
    unpack('C*').pack('U*')
  rescue
    self # failsafe solution, but a dirty one :-)
  end
end

Un-escape HTML in the text. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 31
# Un-escape HTML in this text.
#
# Applies every key => value pair of MY_ENTITIES in turn, replacing each
# occurrence of the key with its value. Returns the resulting String; when
# MY_ENTITIES is empty the receiver itself is returned.
def unescape_html
  MY_ENTITIES.inject(self) do |unescaped, (entity, replacement)|
    unescaped.gsub(entity, replacement)
  end
end

[Source]

    # File lib/feedparser/text-output.rb, line 30
# Wrap text so that lines are at most wrapto characters long where possible.
#
# text   - the String to wrap (not modified).
# wrapto - maximum number of characters taken per line (default 72).
#
# Each run of up to wrapto characters that is followed by spaces or an end
# of line is emitted together with those spaces and a newline. Words longer
# than wrapto are not split. Returns the wrapped String.
def wrap_text(text, wrapto = 72)
  chunk = /(.{1,#{wrapto}})( +|$)\n?/
  text.gsub(chunk) do
    found = Regexp.last_match
    "#{found[1]}#{found[2]}\n"
  end
end

[Validate]