In Files

Parent

Class Index [+]

Quicksearch

Mechanize

Synopsis

The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.

Example

 require 'rubygems'
 require 'mechanize'
 require 'logger'

 agent = Mechanize.new { |a| a.log = Logger.new("mech.log") }
 agent.user_agent_alias = 'Mac Safari'
 page = agent.get("http://www.google.com/")
 search_form = page.form_with(:name => "f")
 search_form.field_with(:name => "q").value = "Hello"
 search_results = agent.submit(search_form)
 puts search_results.body

Constants

VERSION

The version of Mechanize you are using.

AGENT_ALIASES

User Agent aliases

Attributes

open_timeout[RW]
read_timeout[RW]
user_agent[RW]
watch_for_set[RW]
ca_file[RW]
key[RW]
cert[RW]
pass[RW]
redirect_ok[RW]
gzip_enabled[RW]
keep_alive_time[RW]
keep_alive[RW]
conditional_requests[RW]
follow_meta_refresh[RW]
verify_callback[RW]
history_added[RW]
scheme_handlers[RW]
redirection_limit[RW]
request_headers[RW]

A hash of custom request headers

proxy_addr[R]

Proxy settings

proxy_pass[R]
proxy_port[R]
proxy_user[R]
html_parser[RW]

The HTML parser to be used when parsing documents

history[R]
pluggable_parser[R]
html_parser[RW]
log[RW]

Public Class Methods

inherited(child) click to toggle source
     # File lib/mechanize.rb, line 109
109:     def inherited(child)
110:       child.html_parser ||= html_parser
111:       child.log ||= log
112:       super
113:     end
new() click to toggle source
     # File lib/mechanize.rb, line 116
116:   def initialize
117:     # attr_accessors
118:     @cookie_jar     = CookieJar.new
119:     @log            = nil
120:     @open_timeout   = nil
121:     @read_timeout   = nil
122:     @user_agent     = AGENT_ALIASES['Mechanize']
123:     @watch_for_set  = nil
124:     @history_added  = nil
125:     @ca_file        = nil # OpenSSL server certificate file
126: 
127:     # callback for OpenSSL errors while verifying the server certificate
128:     # chain, can be used for debugging or to ignore errors by always
129:     # returning _true_
130:     @verify_callback = nil
131:     @cert           = nil # OpenSSL Certificate
132:     @key            = nil # OpenSSL Private Key
133:     @pass           = nil # OpenSSL Password
134:     @redirect_ok    = true # Should we follow redirects?
135:     @gzip_enabled   = true
136: 
137:     # attr_readers
138:     @history        = Mechanize::History.new
139:     @pluggable_parser = PluggableParser.new
140: 
141:     # Auth variables
142:     @user           = nil # Auth User
143:     @password       = nil # Auth Password
144:     @digest         = nil # DigestAuth Digest
145:     @auth_hash      = {}  # Keep track of urls for sending auth
146:     @request_headers= {}  # A hash of request headers to be used
147: 
148:     # Proxy settings
149:     @proxy_addr     = nil
150:     @proxy_pass     = nil
151:     @proxy_port     = nil
152:     @proxy_user     = nil
153: 
154:     @conditional_requests = true
155: 
156:     @follow_meta_refresh  = false
157:     @redirection_limit    = 20
158: 
159:     # Connection Cache & Keep alive
160:     @connection_cache = {}
161:     @keep_alive_time  = 300
162:     @keep_alive       = true
163: 
164:     @scheme_handlers  = Hash.new { |h,k|
165:       h[k] = lambda { |link, page|
166:         raise UnsupportedSchemeError.new(k)
167:       }
168:     }
169:     @scheme_handlers['http']      = lambda { |link, page| link }
170:     @scheme_handlers['https']     = @scheme_handlers['http']
171:     @scheme_handlers['relative']  = @scheme_handlers['http']
172:     @scheme_handlers['file']      = @scheme_handlers['http']
173: 
174:     @pre_connect_hook = Chain::PreConnectHook.new
175:     @post_connect_hook = Chain::PostConnectHook.new
176: 
177:     @html_parser = self.class.html_parser
178: 
179:     yield self if block_given?
180:   end

Public Instance Methods

auth(user, password) click to toggle source

Sets the user and password to be used for authentication.

     # File lib/mechanize.rb, line 213
213:   def auth(user, password)
214:     @user       = user
215:     @password   = password
216:   end
Also aliased as: basic_auth
back() click to toggle source

Equivalent to the browser back button. Returns the most recent page visited.

     # File lib/mechanize.rb, line 320
320:   def back
321:     @history.pop
322:   end
basic_auth(user, password) click to toggle source
Alias for: auth
click(link) click to toggle source

Clicks the Mechanize::Link object passed in and returns the page fetched.

     # File lib/mechanize.rb, line 311
311:   def click(link)
312:     referer = link.page rescue referer = nil
313:     href = link.respond_to?(:href) ? link.href :
314:       (link['href'] || link['src'])
315:     get(:url => href, :referer => (referer || current_page()))
316:   end
cookies() click to toggle source

Returns a list of cookies stored in the cookie jar.

     # File lib/mechanize.rb, line 208
208:   def cookies
209:     @cookie_jar.to_a
210:   end
current_page() click to toggle source

Returns the current page loaded by Mechanize.

     # File lib/mechanize.rb, line 408
408:   def current_page
409:     @history.last
410:   end
Also aliased as: page
delete(url, query_params = {}, options = {}) click to toggle source
  

Sends a DELETE request to url with query_params, applying the given options:

  delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
     # File lib/mechanize.rb, line 280
280:   def delete(url, query_params = {}, options = {})
281:     page = head(url, query_params, options.merge({:verb => :delete}))
282:     add_to_history(page)
283:     page
284:   end
get(options, parameters = [], referer = nil) click to toggle source

Fetches the URL passed in and returns a page.

     # File lib/mechanize.rb, line 220
220:   def get(options, parameters = [], referer = nil)
221:     verb = :get
222: 
223:     unless options.is_a? Hash
224:       url = options
225:       unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
226:         referer = parameters
227:         parameters = []
228:       end
229:     else
230:       raise ArgumentError.new("url must be specified") unless url = options[:url]
231:       parameters = options[:params] || []
232:       referer    = options[:referer]
233:       headers    = options[:headers]
234:       verb       = options[:verb] || verb
235:     end
236: 
237:     unless referer
238:       if url.to_s =~ /^http/
239:         referer = Page.new(nil, {'content-type'=>'text/html'})
240:       else
241:         referer = current_page || Page.new(nil, {'content-type'=>'text/html'})
242:       end
243:     end
244: 
245:     # FIXME: Huge hack so that using a URI as a referer works.  I need to
246:     # refactor everything to pass around URIs but still support
247:     # Mechanize::Page#base
248:     unless referer.is_a?(Mechanize::File)
249:       referer = referer.is_a?(String) ?
250:       Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
251:         Page.new(referer, {'content-type' => 'text/html'})
252:     end
253: 
254:     # fetch the page
255:     page = fetch_page(  :uri      => url,
256:                         :referer  => referer,
257:                         :headers  => headers || {},
258:                         :verb     => verb,
259:                         :params   => parameters
260:                         )
261:     add_to_history(page)
262:     yield page if block_given?
263:     page
264:   end
get_file(url) click to toggle source

Fetch a file and return the contents of the file.

     # File lib/mechanize.rb, line 305
305:   def get_file(url)
306:     get(url).body
307:   end
head(url, query_params = {}, options = {}) click to toggle source
  

Sends a HEAD request to url with query_params, applying the given options:

  head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
     # File lib/mechanize.rb, line 291
291:   def head(url, query_params = {}, options = {})
292:     options = {
293:       :uri      => url,
294:       :headers  => {},
295:       :params   => query_params,
296:       :verb     => :head
297:     }.merge(options)
298:     # fetch the page
299:     page = fetch_page(options)
300:     yield page if block_given?
301:     page
302:   end
log() click to toggle source
     # File lib/mechanize.rb, line 185
185:   def log; self.class.log end
log=(l) click to toggle source
     # File lib/mechanize.rb, line 184
184:   def log=(l); self.class.log = l end
max_history() click to toggle source
     # File lib/mechanize.rb, line 183
183:   def max_history; @history.max_size end
max_history=(length) click to toggle source
     # File lib/mechanize.rb, line 182
182:   def max_history=(length); @history.max_size = length end
page() click to toggle source
Alias for: current_page
post(url, query={}, headers={}) click to toggle source

Posts to the given URL with the request entity. The request entity is specified by either a string, or a list of key-value pairs represented by a hash or an array of arrays.

Examples:

 agent.post('http://example.com/', "foo" => "bar")

 agent.post('http://example.com/', [ ["foo", "bar"] ])

 agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
     # File lib/mechanize.rb, line 334
334:   def post(url, query={}, headers={})
335:     if query.is_a?(String)
336:       return request_with_entity(:post, url, query, :headers => headers)
337:     end
338:     node = {}
339:     # Create a fake form
340:     class << node
341:       def search(*args); []; end
342:     end
343:     node['method'] = 'POST'
344:     node['enctype'] = 'application/x-www-form-urlencoded'
345: 
346:     form = Form.new(node)
347:     query.each { |k,v|
348:       if v.is_a?(IO)
349:         form.enctype = 'multipart/form-data'
350:         ul = Form::FileUpload.new({'name' => k.to_s},::File.basename(v.path))
351:         ul.file_data = v.read
352:         form.file_uploads << ul
353:       else
354:         form.fields << Form::Field.new({'name' => k.to_s},v)
355:       end
356:     }
357:     post_form(url, form, headers)
358:   end
post_connect_hooks() click to toggle source
     # File lib/mechanize.rb, line 191
191:   def post_connect_hooks
192:     @post_connect_hook.hooks
193:   end
pre_connect_hooks() click to toggle source
     # File lib/mechanize.rb, line 187
187:   def pre_connect_hooks
188:     @pre_connect_hook.hooks
189:   end
put(url, entity, options = {}) click to toggle source
  

Sends a PUT request to url with entity as the request body, applying the given options:

  put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
     # File lib/mechanize.rb, line 271
271:   def put(url, entity, options = {})
272:     request_with_entity(:put, url, entity, options)
273:   end
request_with_entity(verb, url, entity, options={}) click to toggle source
     # File lib/mechanize.rb, line 382
382:   def request_with_entity(verb, url, entity, options={})
383:     cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})
384: 
385:     options = {
386:       :uri      => url,
387:       :referer  => cur_page,
388:       :headers  => {},
389:     }.update(options)
390: 
391:     headers = {
392:       'Content-Type' => 'application/octet-stream',
393:       'Content-Length' => entity.size.to_s,
394:     }.update(options[:headers])
395: 
396:     options.update({
397:                      :verb => verb,
398:                      :params => [entity],
399:                      :headers => headers,
400:                    })
401: 
402:     page = fetch_page(options)
403:     add_to_history(page)
404:     page
405:   end
set_proxy(addr, port, user = nil, pass = nil) click to toggle source

Sets the proxy address, port, user, and password. addr should be a host, with no “http://” prefix.

     # File lib/mechanize.rb, line 197
197:   def set_proxy(addr, port, user = nil, pass = nil)
198:     @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
199:   end
submit(form, button=nil, headers={}) click to toggle source

Submit a form with an optional button. Without a button:

 page = agent.get('http://example.com')
 agent.submit(page.forms.first)

With a button:

 agent.submit(page.forms.first, page.forms.first.buttons.first)
     # File lib/mechanize.rb, line 366
366:   def submit(form, button=nil, headers={})
367:     form.add_button_to_query(button) if button
368:     case form.method.upcase
369:     when 'POST'
370:       post_form(form.action, form, headers)
371:     when 'GET'
372:       get(  :url      => form.action.gsub(/\?[^\?]*$/, ''),
373:             :params   => form.build_query,
374:             :headers  => headers,
375:             :referer  => form.page
376:             )
377:     else
378:       raise "unsupported method: #{form.method.upcase}"
379:     end
380:   end
transact() click to toggle source

Runs the given block, then restores the page history to what it was before the block ran. self is given as a parameter to the block. Returns the value of the block.

     # File lib/mechanize.rb, line 427
427:   def transact
428:     history_backup = @history.dup
429:     begin
430:       yield self
431:     ensure
432:       @history = history_backup
433:     end
434:   end
user_agent_alias=(al) click to toggle source

Set the user agent for the Mechanize object. See AGENT_ALIASES

     # File lib/mechanize.rb, line 203
203:   def user_agent_alias=(al)
204:     self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
205:   end
visited?(url) click to toggle source

Returns whether or not a URL has been visited.

     # File lib/mechanize.rb, line 413
413:   def visited?(url)
414:     ! visited_page(url).nil?
415:   end
visited_page(url) click to toggle source

Returns a visited page for the URL passed in, otherwise nil.

     # File lib/mechanize.rb, line 418
418:   def visited_page(url)
419:     if url.respond_to? :href
420:       url = url.href
421:     end
422:     @history.visited_page(resolve(url))
423:   end

Private Instance Methods

add_to_history(page) click to toggle source
     # File lib/mechanize.rb, line 638
638:   def add_to_history(page)
639:     @history.push(page, resolve(page.uri))
640:     history_added.call(page) if history_added
641:   end
fetch_page(params) click to toggle source

uri is an absolute URI

     # File lib/mechanize.rb, line 470
470:   def fetch_page(params)
471:     options = {
472:       :request    => nil,
473:       :response   => nil,
474:       :connection => nil,
475:       :referer    => current_page(),
476:       :uri        => nil,
477:       :verb       => :get,
478:       :agent      => self,
479:       :redirects  => 0,
480:       :params     => [],
481:       :headers    => {},
482:     }.merge(params)
483: 
484:     before_connect = Chain.new([
485:                                 Chain::URIResolver.new(@scheme_handlers),
486:                                 Chain::ParameterResolver.new,
487:                                 Chain::RequestResolver.new,
488:                                 Chain::ConnectionResolver.new(
489:                                                               @connection_cache,
490:                                                               @keep_alive,
491:                                                               @proxy_addr,
492:                                                               @proxy_port,
493:                                                               @proxy_user,
494:                                                               @proxy_pass
495:                                                               ),
496:                                 Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
497:                                 Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
498:                                 Chain::HeaderResolver.new(
499:                                                           @keep_alive,
500:                                                           @keep_alive_time,
501:                                                           @cookie_jar,
502:                                                           @user_agent,
503:                                                           @gzip_enabled,
504:                                                           @request_headers
505:                                                           ),
506:                                 Chain::CustomHeaders.new,
507:                                 @pre_connect_hook,
508:                                ])
509:     before_connect.handle(options)
510: 
511:     uri           = options[:uri]
512:     request       = options[:request]
513:     cur_page      = options[:referer]
514:     request_data  = options[:params]
515:     redirects     = options[:redirects]
516:     http_obj      = options[:connection]
517: 
518:     # Add If-Modified-Since if page is in history
519:     if( (page = visited_page(uri)) && page.response['Last-Modified'] )
520:       request['If-Modified-Since'] = page.response['Last-Modified']
521:     end if(@conditional_requests)
522: 
523:     http_obj.mu_lock
524:     # Specify timeouts if given
525:     http_obj.open_timeout = @open_timeout if @open_timeout
526:     http_obj.read_timeout = @read_timeout if @read_timeout
527:     http_obj.start unless http_obj.started?
528: 
529:     # Log specified headers for the request
530:     log.info("#{ request.class }: #{ request.path }") if log
531:     request.each_header do |k, v|
532:       log.debug("request-header: #{ k } => #{ v }")
533:     end if log
534: 
535:     # Send the request
536:     attempts = 0
537:     begin
538:       response = http_obj.request(request, *request_data) { |r|
539:         connection_chain = Chain.new([
540:                                       Chain::ResponseReader.new(r),
541:                                       Chain::BodyDecodingHandler.new,
542:                                      ])
543:         connection_chain.handle(options)
544:       }
545:     rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
546:       log.error("Rescuing EOF error") if log
547:       http_obj.finish
548:       raise x if attempts >= 2
549:       request.body = nil
550:       http_obj.start
551:       attempts += 1
552:       retry
553:     end
554: 
555:     after_connect = Chain.new([
556:                                @post_connect_hook,
557:                                Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
558:                                Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
559:                               ])
560:     after_connect.handle(options)
561:     http_obj.mu_unlock
562: 
563:     res_klass = options[:res_klass]
564:     response_body = options[:response_body]
565:     page = options[:page]
566: 
567:     log.info("status: #{ page.code }") if log
568: 
569:     if follow_meta_refresh
570:       redirect_uri  = nil
571:       referer       = page
572:       if (page.respond_to?(:meta) && (redirect = page.meta.first))
573:         redirect_uri = redirect.uri.to_s
574:         sleep redirect.node['delay'].to_f
575:         referer = Page.new(nil, {'content-type'=>'text/html'})
576:       elsif refresh = response['refresh']
577:         delay, redirect_uri = Page::Meta.parse(refresh, uri)
578:         raise StandardError, "Invalid refresh http header" unless delay
579:         if redirects + 1 > redirection_limit
580:           raise RedirectLimitReachedError.new(page, redirects)
581:         end
582:         sleep delay.to_f
583:       end
584:       if redirect_uri
585:         @history.push(page, page.uri)
586:         return fetch_page(
587:                           :uri        => redirect_uri,
588:                           :referer    => referer,
589:                           :params     => [],
590:                           :verb       => :get,
591:                           :redirects  => redirects + 1
592:                           )
593:       end
594:     end
595: 
596:     return page if res_klass <= Net::HTTPSuccess
597: 
598:     if res_klass == Net::HTTPNotModified
599:       log.debug("Got cached page") if log
600:       return visited_page(uri) || page
601:     elsif res_klass <= Net::HTTPRedirection
602:       return page unless follow_redirect?
603:       log.info("follow redirect to: #{ response['Location'] }") if log
604:       from_uri  = page.uri
605:       raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
606:       redirect_verb = options[:verb] == :head ? :head : :get
607:       page = fetch_page(  :uri => response['Location'].to_s,
608:                           :referer => page,
609:                           :params  => [],
610:                           :verb => redirect_verb,
611:                           :redirects => redirects + 1
612:                           )
613:       @history.push(page, from_uri)
614:       return page
615:     elsif res_klass <= Net::HTTPUnauthorized
616:       raise ResponseCodeError.new(page) unless @user || @password
617:       raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
618:       if response['www-authenticate'] =~ /Digest/
619:         @auth_hash[uri.host] = :digest
620:         if response['server'] =~ /Microsoft-IIS/
621:           @auth_hash[uri.host] = :iis_digest
622:         end
623:         @digest = response['www-authenticate']
624:       else
625:         @auth_hash[uri.host] = :basic
626:       end
627:       return fetch_page(  :uri      => uri,
628:                           :referer  => cur_page,
629:                           :verb     => request.method.downcase.to_sym,
630:                           :params   => request_data,
631:                           :headers  => options[:headers]
632:                           )
633:     end
634: 
635:     raise ResponseCodeError.new(page), "Unhandled response", caller
636:   end
post_form(url, form, headers = {}) click to toggle source
     # File lib/mechanize.rb, line 448
448:   def post_form(url, form, headers = {})
449:     cur_page = form.page || current_page ||
450:       Page.new( nil, {'content-type'=>'text/html'})
451: 
452:     request_data = form.request_data
453: 
454:     log.debug("query: #{ request_data.inspect }") if log
455: 
456:     # fetch the page
457:     page = fetch_page(  :uri      => url,
458:                         :referer  => cur_page,
459:                         :verb     => :post,
460:                         :params   => [request_data],
461:                         :headers  => {
462:                           'Content-Type'    => form.enctype,
463:                           'Content-Length'  => request_data.size.to_s,
464:                         }.merge(headers))
465:     add_to_history(page)
466:     page
467:   end
resolve(url, referer = current_page()) click to toggle source
     # File lib/mechanize.rb, line 440
440:   def resolve(url, referer = current_page())
441:     hash = { :uri => url, :referer => referer }
442:     chain = Chain.new([
443:                        Chain::URIResolver.new(@scheme_handlers)
444:                       ]).handle(hash)
445:     hash[:uri].to_s
446:   end

Disabled; run with --debug to generate this.

[Validate]

Generated with the Darkfish Rdoc Generator 1.1.6.