class PDF::Reader::PageTextReceiver

Builds a UTF-8 string of all the text on a single page by processing all the operators in a content stream.

Constants

SPACE

Attributes

options[R]
state[R]

Public Instance Methods

content() click to toggle source

deprecated

# File lib/pdf/reader/page_text_receiver.rb, line 75
# Deprecated. Renders the text collected for the current page as one string.
#
# Reads the page's :MediaBox rectangle, builds a PageLayout from the result
# of #runs (called with its default options) plus that rectangle, and returns
# the layout's #to_s.
def content
  page_bounds = @page.rectangles[:MediaBox]
  layout = PageLayout.new(runs, page_bounds)
  layout.to_s
end
invoke_xobject(label) click to toggle source

XObjects

# File lib/pdf/reader/page_text_receiver.rb, line 114
# Asks the page state to invoke the XObject named by +label+. The state
# yields an object to the block; when that object is a
# PDF::Reader::FormXObject it is walked with this receiver, and any other
# kind of object is ignored.
#
# label - the XObject name handed to @state.invoke_xobject.
#
# Returns whatever @state.invoke_xobject returns.
def invoke_xobject(label)
  @state.invoke_xobject(label) do |xobj|
    xobj.walk(self) if xobj.is_a?(PDF::Reader::FormXObject)
  end
end
move_to_next_line_and_show_text(str) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 100
# Handler for the operator noted in the trailing comment on the def line (').
#
# First tells the page state to move to the start of the next line, then
# shows +str+ via #show_text.
#
# str - the text operand, passed through to #show_text untouched.
def move_to_next_line_and_show_text(str) # '
  @state.move_to_start_of_next_line
  show_text(str)
end
page=(page) click to toggle source

starting a new page

# File lib/pdf/reader/page_text_receiver.rb, line 43
# Begins processing a new page.
#
# Builds a fresh PageState for +page+, remembers the page itself, and resets
# the @content and @characters accumulators to new empty arrays.
#
# page - the page object about to be processed; it is handed to PageState.new.
def page=(page)
  # The state is constructed before anything is assigned, so a failure in
  # PageState.new leaves the receiver's existing instance variables untouched.
  @state, @page = PageState.new(page), page
  @content = []
  @characters = []
end
runs(opts = {}) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 50
# Returns the characters recorded for the current page after passing them
# through a fixed sequence of filters.
#
# opts - Hash of options:
#        :rect             - rectangle handed to BoundingRectangleRunsFilter;
#                            defaults to the page's :CropBox. When the value
#                            is nil/false the bounding filter is skipped.
#        :skip_zero_width  - apply ZeroWidthRunsFilter (default: true).
#        :skip_overlapping - apply OverlappingRunsFilter (default: true).
#        :merge            - pass the result through #merge_runs
#                            (default: true).
#
# NoTextFilter.exclude_empty_strings is always applied.
def runs(opts = {})
  selected = @characters

  rect = opts.fetch(:rect, @page.rectangles[:CropBox])
  selected = BoundingRectangleRunsFilter.runs_within_rect(selected, rect) if rect

  selected = ZeroWidthRunsFilter.exclude_zero_width_runs(selected) if opts.fetch(:skip_zero_width, true)
  selected = OverlappingRunsFilter.exclude_redundant_runs(selected) if opts.fetch(:skip_overlapping, true)

  # Not optional: this filter runs regardless of the options given.
  selected = NoTextFilter.exclude_empty_strings(selected)

  selected = merge_runs(selected) if opts.fetch(:merge, true)
  selected
end
set_spacing_next_line_show_text(aw, ac, string) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 105
# Handler for the operator noted in the trailing comment on the def line (").
#
# Stores +aw+ as the word spacing and +ac+ as the character spacing on the
# page state, then delegates to #move_to_next_line_and_show_text.
#
# aw     - value passed to @state.set_word_spacing.
# ac     - value passed to @state.set_character_spacing.
# string - the text operand, passed on to #move_to_next_line_and_show_text.
def set_spacing_next_line_show_text(aw, ac, string) # "
  @state.set_word_spacing(aw)
  @state.set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end
show_text(string) click to toggle source

Text Showing Operators

record text that is drawn on the page

# File lib/pdf/reader/page_text_receiver.rb, line 84
# Records text that is drawn on the page. Handler for the Tj operator (see
# the example in the trailing comment on the def line).
#
# string - the text operand; forwarded unchanged to #internal_show_text.
def show_text(string) # Tj (AWAY)
  internal_show_text(string)
end
show_text_with_positioning(params) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 88
# Handler for the TJ operator (see the example in the trailing comment on the
# def line). Walks the operand list in order:
#
# * String elements are shown via #internal_show_text.
# * Numeric elements are passed to @state.process_glyph_displacement as the
#   second argument, with 0 and false as the first and third.
# * Anything else is skipped.
#
# params - the collection of strings and numbers to process.
#
# Returns whatever params.each returns.
def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
  params.each do |element|
    case element
    when String
      internal_show_text(element)
    when Numeric
      @state.process_glyph_displacement(0, element, false)
    end
    # elements of any other type fall through and are ignored
  end
end

Private Instance Methods

apply_rotation(x, y) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 150
# Remaps a coordinate pair according to the current page's rotate value.
#
#   rotate == 90   ->  [y, -x]
#   rotate == 180  ->  [-x, -y]
#   rotate == 270  ->  [-y, x]
#   anything else  ->  [x, y] (unchanged)
#
# x, y - the coordinates to remap.
#
# Returns a two-element Array holding the remapped x and y.
def apply_rotation(x, y)
  case @page.rotate
  when 90  then [y, x * -1]
  when 180 then [x * -1, y * -1]
  when 270 then [y * -1, x]
  else          [x, y]
  end
end
group_chars_into_runs(chars) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 176
# Folds a sequence of run objects into a shorter list by combining neighbours.
#
# Each element is compared with the most recently emitted run: when that run
# reports mergable?(element), the two are combined with + and the result
# replaces the last run; otherwise the element starts a new run.
#
# chars - the runs to combine, in the order they should be considered.
#
# Returns a new Array of runs.
def group_chars_into_runs(chars)
  merged = []
  chars.each do |char|
    if !merged.empty? && merged.last.mergable?(char)
      merged[-1] = merged.last + char
    else
      merged << char
    end
  end
  merged
end
internal_show_text(string) click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 125
# Records the glyphs of +string+ as TextRun objects in @characters and
# advances the page state past each glyph.
#
# For every glyph code unpacked by the current font this:
#   1. takes the position @state.trm_transform(0, 0) and passes it through
#      #apply_rotation,
#   2. converts the glyph to UTF-8 using the current font,
#   3. appends a TextRun for it unless the UTF-8 text equals SPACE,
#   4. calls @state.process_glyph_displacement with the unscaled glyph width,
#      flagging whether the glyph was a space.
#
# string - the text operand; validated as a String through
#          PDF::Reader::Error.validate_type_as_malformed.
#
# Raises PDF::Reader::MalformedPDFError when the state has no current font.
# Returns the list of unpacked glyph codes.
def internal_show_text(string)
  PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
  raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?

  # Fixed multiplier carried over from the original `th = 1` local;
  # presumably the PDF horizontal scaling factor (Th) - TODO confirm.
  horizontal_scale = 1

  @state.current_font.unpack(string).each do |glyph_code|
    # position at which the current glyph is painted
    raw_x, raw_y = @state.trm_transform(0, 0)
    run_x, run_y = apply_rotation(raw_x, raw_y)

    text = @state.current_font.to_utf8(glyph_code)

    # the displacement is applied after recording the glyph so the next
    # glyph is positioned correctly
    width = @state.current_font.glyph_width_in_text_space(glyph_code)
    scaled_width = width * @state.font_size * horizontal_scale
    is_space = text == SPACE

    @characters << TextRun.new(run_x, run_y, scaled_width, @state.font_size, text) unless is_space
    @state.process_glyph_displacement(width, 0, is_space)
  end
end
merge_runs(runs) click to toggle source

take a collection of TextRun objects and merge any that are in close proximity

# File lib/pdf/reader/page_text_receiver.rb, line 168
# Takes a collection of TextRun objects and merges any that are in close
# proximity.
#
# Runs are bucketed by the integer part of their y value (y.to_i), each
# bucket is sorted and combined with #group_chars_into_runs, and the combined
# runs from all buckets are flattened into one list and sorted again.
#
# runs - the runs to merge.
#
# Returns a sorted Array of merged runs.
def merge_runs(runs)
  lines = runs.group_by { |run| run.y.to_i }
  merged_lines = lines.map { |_line_y, line_runs| group_chars_into_runs(line_runs.sort) }
  merged_lines.flatten.sort
end