Class FeedParser::SGMLParser
In: lib/feedparser/sgml-parser.rb
Parent: Object
RuntimeError UnknownFeedTypeException FeedItem\n[lib/feedparser/feedparser.rb\nlib/feedparser/html-output.rb\nlib/feedparser/text-output.rb] AtomItem RSSItem SGMLParser HTML2TextParser Feed\n[lib/feedparser/feedparser.rb\nlib/feedparser/html-output.rb\nlib/feedparser/text-output.rb] lib/feedparser/feedparser.rb lib/feedparser/sgml-parser.rb lib/feedparser/text-output.rb lib/feedparser/html2text-parser.rb FeedParser dot/m_7_0.png

Methods

Constants

# Regular expressions used for parsing:
Interesting = /[&<]/
# A markup opener with no terminator yet: '&' optionally followed by a name or
# '#digits', or '<' optionally followed by the start of a tag name, an end tag,
# or a '!' declaration.
Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + '![^<>]*)?')
# '&name;' -- group 1 captures the entity name.
Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
# '&#digits;' -- group 1 captures the decimal digits.
Charref = /&#([0-9]+);/
# '<' followed by '>' or a letter.
Starttagopen = /<[>a-zA-Z]/
# '</' followed by '<', '>' or a letter.
Endtagopen = /<\/[<>a-zA-Z]/
# Either angle bracket.
Endbracket = /[<>]/
# '<!' ... '>' with no angle brackets in between.
Special = /<![^<>]*>/
# Literal '<!--'.
Commentopen = /<!--/
# '--', optional spaces/tabs/newlines, then '>'.
Commentclose = /--[ \t\n]*>/
# A name: a letter followed by letters, digits, '.' or '-'.
Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
# One attribute: leading whitespace/commas, the name (group 1), an optional
# '= value' part (group 2) whose value (group 3) is single-quoted,
# double-quoted, or an unquoted run of the listed characters.
Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' + '(\s*=\s*' + "('[^']*'" + '|"[^"]*"' + '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')
# Entity names that handle_entityref resolves, mapped to their replacement text.
Entitydefs = {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

Public Class methods

[Source]

    # File lib/feedparser/sgml-parser.rb, line 30
30:     def initialize(verbose=false)
31:       @verbose = verbose
32:       reset
33:     end

Public Instance methods

[Source]

    # File lib/feedparser/sgml-parser.rb, line 61
61:     def close
62:       goahead(true)
63:     end

[Source]

    # File lib/feedparser/sgml-parser.rb, line 56
56:     def feed(data)
57:       @rawdata << data
58:       goahead(false)
59:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 242
242:     def finish_endtag(tag)
243:       if tag == ''
244:         found = @stack.length - 1
245:         if found < 0
246:           unknown_endtag(tag)
247:           return
248:         end
249:       else
250:         unless @stack.include? tag
251:           method = 'end_' + tag
252:           unless self.respond_to?(method)
253:             unknown_endtag(tag)
254:           end
255:           return
256:         end
257:         found = @stack.index(tag) #or @stack.length
258:       end
259:       while @stack.length > found
260:         tag = @stack[-1]
261:         method = 'end_' + tag
262:         if respond_to?(method)
263:           handle_endtag(tag, method)
264:         else
265:           unknown_endtag(tag)
266:         end
267:         @stack.pop
268:       end
269:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 224
224:     def finish_starttag(tag, attrs)
225:       method = 'start_' + tag
226:       if self.respond_to?(method)
227:         @stack << tag
228:         handle_starttag(tag, method, attrs)
229:         return 1
230:       else
231:         method = 'do_' + tag
232:         if self.respond_to?(method)
233:           handle_starttag(tag, method, attrs)
234:           return 0
235:         else
236:           unknown_starttag(tag, attrs)
237:           return -1
238:         end
239:       end
240:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 65
 65:     def goahead(_end)
 66:       rawdata = @rawdata
 67:       i = 0
 68:       n = rawdata.length
 69:       while i < n
 70:         if @nomoretags
 71:           handle_data(rawdata[i..(n-1)])
 72:           i = n
 73:           break
 74:         end
 75:         j = rawdata.index(Interesting, i)
 76:         j = n unless j
 77:         if i < j
 78:           handle_data(rawdata[i..(j-1)])
 79:         end
 80:         i = j
 81:         break if (i == n)
 82:         if rawdata[i] == ?< #
 83:           if rawdata.index(Starttagopen, i) == i
 84:             if @literal
 85:               handle_data(rawdata[i, 1])
 86:               i += 1
 87:               next
 88:             end
 89:             k = parse_starttag(i)
 90:             break unless k
 91:             i = k
 92:             next
 93:           end
 94:           if rawdata.index(Endtagopen, i) == i
 95:             k = parse_endtag(i)
 96:             break unless k
 97:             i = k
 98:             @literal = false
 99:             next
100:           end
101:           if rawdata.index(Commentopen, i) == i
102:             if @literal
103:               handle_data(rawdata[i,1])
104:               i += 1
105:               next
106:             end
107:             k = parse_comment(i)
108:             break unless k
109:             i += k
110:             next
111:           end
112:           if rawdata.index(Special, i) == i
113:             if @literal
114:               handle_data(rawdata[i, 1])
115:               i += 1
116:               next
117:             end
118:             k = parse_special(i)
119:             break unless k
120:             i += k
121:             next
122:           end
123:         elsif rawdata[i] == ?& #
124:           if rawdata.index(Charref, i) == i
125:             i += $&.length
126:             handle_charref($1)
127:             i -= 1 unless rawdata[i-1] == ?;
128:             next
129:           end
130:           if rawdata.index(Entityref, i) == i
131:             i += $&.length
132:             handle_entityref($1)
133:             i -= 1 unless rawdata[i-1] == ?;
134:             next
135:           end
136:         else
137:           raise RuntimeError, 'neither < nor & ??'
138:         end
139:         # We get here only if incomplete matches but
140:         # nothing else
141:         match = rawdata.index(Incomplete, i)
142:         unless match == i
143:           handle_data(rawdata[i, 1])
144:           i += 1
145:           next
146:         end
147:         j = match + $&.length
148:         break if j == n # Really incomplete
149:         handle_data(rawdata[i..(j-1)])
150:         i = j
151:       end
152:       # end while
153:       if _end and i < n
154:         handle_data(@rawdata[i..(n-1)])
155:         i = n
156:       end
157:       @rawdata = rawdata[i..-1]
158:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 295
295:     def handle_charref(name)
296:       n = name.to_i
297:       if !(0 <= n && n <= 255)
298:         unknown_charref(name)
299:         return
300:       end
301:       handle_data(n.chr)
302:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 317
317:     def handle_comment(data)
318:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 314
314:     def handle_data(data)
315:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 284
284:     def handle_endtag(tag, method)
285:       self.send(method)
286:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 304
304:     def handle_entityref(name)
305:       table = Entitydefs
306:       if table.include?(name)
307:         handle_data(table[name])
308:       else
309:         unknown_entityref(name)
310:         return
311:       end
312:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 320
320:     def handle_special(data)
321:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 280
280:     def handle_starttag(tag, method, attrs)
281:       self.send(method, attrs)
282:     end

[Source]

    # File lib/feedparser/sgml-parser.rb, line 43
43:     def has_context(gi)
44:       @stack.include? gi
45:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 160
160:     def parse_comment(i)
161:       rawdata = @rawdata
162:       if rawdata[i, 4] != '<!--'
163:         raise RuntimeError, 'unexpected call to handle_comment'
164:       end
165:       match = rawdata.index(Commentclose, i)
166:       return nil unless match
167:       matched_length = $&.length
168:       j = match
169:       handle_comment(rawdata[i+4..(j-1)])
170:       j = match + matched_length
171:       return j-i
172:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 212
212:     def parse_endtag(i)
213:       rawdata = @rawdata
214:       j = rawdata.index(Endbracket, i + 1)
215:       return nil unless j
216:       tag = (rawdata[i+2..j-1].strip).downcase
217:       if rawdata[j] == ?> #
218:         j += 1
219:       end
220:       finish_endtag(tag)
221:       return j
222:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 271
271:     def parse_special(i)
272:       rawdata = @rawdata
273:       match = rawdata.index(Endbracket, i+1)
274:       return nil unless match
275:       matched_length = $&.length
276:       handle_special(rawdata[i+1..(match-1)])
277:       return match - i + matched_length
278:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 174
174:     def parse_starttag(i)
175:       rawdata = @rawdata
176:       j = rawdata.index(Endbracket, i + 1)
177:       return nil unless j
178:       attrs = []
179:       if rawdata[i+1] == ?> #
180:         # SGML shorthand: <> == <last open tag seen>
181:         k = j
182:         tag = @lasttag
183:       else
184:         match = rawdata.index(Tagfind, i + 1)
185:         unless match
186:           raise RuntimeError, 'unexpected call to parse_starttag'
187:         end
188:         k = i + 1 + ($&.length)
189:         tag = $&.downcase
190:         @lasttag = tag
191:       end
192:       while k < j
193:         break unless rawdata.index(Attrfind, k)
194:         matched_length = $&.length
195:         attrname, rest, attrvalue = $1, $2, $3
196:         if not rest
197:           attrvalue = '' # was: = attrname
198:         elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
199:             (attrvalue[0] == ?" && attrvalue[-1,1] == ?")
200:           attrvalue = attrvalue[1..-2]
201:         end
202:         attrs << [attrname.downcase, attrvalue]
203:         k += matched_length
204:       end
205:       if rawdata[j] == ?> #
206:         j += 1
207:       end
208:       finish_starttag(tag, attrs)
209:       return j
210:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 288
288:     def report_unbalanced(tag)
289:       if @verbose
290:         print '*** Unbalanced </' + tag + '>', "\n"
291:         print '*** Stack:', self.stack, "\n"
292:       end
293:     end

[Source]

    # File lib/feedparser/sgml-parser.rb, line 35
35:     def reset
36:       @rawdata = ''
37:       @stack = []
38:       @lasttag = '???'
39:       @nomoretags = false
40:       @literal = false
41:     end

[Source]

    # File lib/feedparser/sgml-parser.rb, line 52
52:     def setliteral(*args)
53:       @literal = true
54:     end

[Source]

    # File lib/feedparser/sgml-parser.rb, line 47
47:     def setnomoretags
48:       @nomoretags = true
49:       @literal = true
50:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 327
327:     def unknown_charref(ref)
328:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 325
325:     def unknown_endtag(tag)
326:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 329
329:     def unknown_entityref(ref)
330:     end

[Source]

     # File lib/feedparser/sgml-parser.rb, line 323
323:     def unknown_starttag(tag, attrs)
324:     end

[Validate]