class PDF::Reader::PageLayout
Takes a collection of TextRun objects and renders them into a single string that best approximates the way they would appear on a rendered PDF page.
mediabox should be a four-number array that describes the dimensions of the page to be rendered, as given by the page's MediaBox attribute.
Public Class Methods
new(runs, mediabox)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 10
#
# Prepares a layout for the given text runs on a page of the given size.
#
# runs     - a collection of TextRun objects; they are combined via merge_runs.
# mediabox - a four-number array; elements 0 and 2 bound the page horizontally,
#            elements 1 and 3 bound it vertically.
#
# Raises ArgumentError when mediabox is nil.
def initialize(runs, mediabox)
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?

  @runs = merge_runs(runs)
  @mean_font_size = mean(@runs.map(&:font_size)) || 0
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
  @page_width = mediabox[2] - mediabox[0]
  @page_height = mediabox[3] - mediabox[1]
  # Smallest x of any run (nil when there are no runs); to_s subtracts it
  # from every run's x before mapping to a column.
  @x_offset = @runs.map(&:x).min
  # Dots are escaped so only a literal "2.0.0" version string matches.
  @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2\.0\.0/ &&
                                RUBY_VERSION >= "1.9.0"
end
Public Instance Methods
to_s()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 23
#
# Renders the runs into a single string. Each run's text is written into a
# grid of space-filled rows at a row/column derived from its x and y; runs
# that fall outside the grid are skipped. Blank leading/trailing rows are
# dropped, trailing whitespace is stripped from each row, and the rows are
# joined with newlines. Returns "" when there are no runs.
def to_s
  return "" if @runs.empty?

  # One distinct string per row: local_string_insert mutates rows in place.
  grid = (0...row_count).map { " " * col_count }

  @runs.each do |run|
    column = ((run.x - @x_offset) / col_multiplier).round
    row = row_count - (run.y / row_multiplier).round

    next if row >= row_count || row < 0
    next if column >= col_count || column < 0

    local_string_insert(grid[row], run.text, column)
  end

  trimmed = interesting_rows(grid).map(&:rstrip)
  trimmed.join("\n")
end
Private Instance Methods
col_count()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 60
#
# Number of character columns in the rendered grid: the page width divided
# by the mean glyph width, scaled by 1.05 and rounded down. The value is
# computed once and cached in @col_count.
def col_count
  return @col_count if @col_count

  glyphs_per_row = @page_width / @mean_glyph_width
  @col_count = (glyphs_per_row * 1.05).floor
end
col_multiplier()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 68
#
# Page width divided by the number of columns, both as floats. The value is
# computed once and cached in @col_multiplier; to_s divides x offsets by it
# to obtain a column index.
def col_multiplier
  return @col_multiplier if @col_multiplier

  width = @page_width.to_f
  @col_multiplier = width / col_count.to_f
end
each_line() { |y, collection| ... }
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 80
#
# Sorts the runs, groups them by the integer part of their y value, and
# yields each (y, runs-in-that-group) pair to the given block. Returns an
# array holding the block's result for every group.
def each_line(&block)
  lines = @runs.sort.group_by { |run| run.y.to_i }
  lines.map { |y, collection| yield y, collection }
end
group_chars_into_runs(chars)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 98
#
# Walks chars in order and folds each one into the previous run when the
# previous run reports it as mergable; otherwise it starts a new run.
#
# chars - an ordered array of objects responding to mergable?(other) and +.
#
# Returns a new array of runs. The chars array itself is left untouched
# (it is iterated, not consumed with shift).
def group_chars_into_runs(chars)
  chars.each_with_object([]) do |char, runs|
    if runs.empty?
      runs << char
    elsif runs.last.mergable?(char)
      # Replace the last run with the combination of it and this char.
      runs[-1] = runs.last + char
    else
      runs << char
    end
  end
end
interesting_rows(rows)
click to toggle source
given an array of strings, return a new array with empty rows from the beginning and end removed.
interesting_rows([ "", "one", "two", "" ]) => [ "one", "two" ]
# File lib/pdf/reader/page_layout.rb, line 45
#
# Given an array of strings, returns a new array with the blank rows at the
# beginning and end removed. A row counts as blank when it is empty after
# stripping whitespace. Blank rows between rows with text are kept.
#
#   interesting_rows([ "", "one", "two", "" ])  # => [ "one", "two" ]
#
# Returns an Array (empty when no row contains text).
def interesting_rows(rows)
  line_lengths = rows.map { |row| row.strip.length }

  first_line_with_text = line_lengths.index { |length| length > 0 }
  return [] if first_line_with_text.nil?

  last_line_with_text = line_lengths.rindex { |length| length > 0 }
  # Slicing already yields a fresh Array; no further mapping is needed.
  rows[first_line_with_text..last_line_with_text]
end
local_string_insert(haystack, needle, index)
click to toggle source
This is a simple alternative to String#[]=. We can't use the string method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
See my bug report at github.com/rubinius/rubinius/issues/1985
# File lib/pdf/reader/page_layout.rb, line 116
#
# Overwrites part of haystack, in place, with needle starting at index.
#
# haystack - the String to modify.
# needle   - the String to write into haystack.
# index    - the character offset in haystack where needle starts.
#
# When @current_platform_is_rbx_19 is set, the string is rebuilt from its
# head, the needle and the remaining tail instead of using String#[]=.
# The whole tail is kept, however long the row is.
def local_string_insert(haystack, needle, index)
  if @current_platform_is_rbx_19
    head = haystack[0, index] || ""
    # Slice to the end of the string; nil when the needle reaches past it.
    tail = haystack[(index + needle.length)..-1] || ""
    haystack.replace(head + needle + tail)
  else
    haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
  end
end
mean(collection)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 72
#
# Arithmetic mean of the values in collection. Returns the integer 0 for an
# empty collection; otherwise the sum divided by the size as a float.
def mean(collection)
  count = collection.size
  return 0 if count == 0

  total = collection.inject(0) { |running, value| running + value }
  total / count.to_f
end
merge_runs(runs)
click to toggle source
take a collection of TextRun objects and merge any that are in close proximity
# File lib/pdf/reader/page_layout.rb, line 90
#
# Groups the given runs by the integer part of their y value, sorts each
# group and passes it to group_chars_into_runs, then flattens the per-group
# results into one sorted array.
def merge_runs(runs)
  rows = runs.group_by { |char| char.y.to_i }
  merged_rows = rows.map { |_y, chars| group_chars_into_runs(chars.sort) }
  merged_rows.flatten.sort
end
row_count()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 56
#
# Number of text rows in the rendered grid: the page height divided by the
# mean font size, rounded down. The value is computed once and cached in
# @row_count.
def row_count
  return @row_count if @row_count

  rows = @page_height / @mean_font_size
  @row_count = rows.floor
end
row_multiplier()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 64
#
# Page height divided by the number of rows, both as floats. The value is
# computed once and cached in @row_multiplier; to_s divides y values by it
# to obtain a row index.
def row_multiplier
  return @row_multiplier if @row_multiplier

  height = @page_height.to_f
  @row_multiplier = height / row_count.to_f
end