Some useful additions to the String class. Copyright (C) 2010, Jacques Distler. All rights reserved. Licensed under a triple GPL/MPL/LGPL license.
# File lib/itex_stringsupport.rb, line 34
#
# Re-tags the receiver as raw bytes by forcing its encoding to ASCII-8BIT.
# The underlying bytes are not changed; only the encoding label is.
#
# Returns whatever String#force_encoding returns (the receiver itself, which
# is modified in place).
def as_bytes
  self.force_encoding("ASCII-8BIT")
end
# File lib/itex_stringsupport.rb, line 52
#
# Re-tags the receiver as UTF-8 text by forcing its encoding to UTF-8.
# The underlying bytes are not changed or validated; only the encoding
# label is.
#
# Returns whatever String#force_encoding returns (the receiver itself, which
# is modified in place).
def as_utf8
  self.force_encoding("UTF-8")
end
# File lib/itex_stringsupport.rb, line 81
#
# Returns a copy of the receiver with invalid Numeric Character References
# removed.
#
# Hexadecimal NCRs (&#x...; / &#X...;) are processed first, then decimal
# NCRs (&#...;). An NCR is kept verbatim when its code point, packed with
# Array#pack('U*') and viewed as bytes, matches UTF8_REGEX; otherwise it is
# replaced by the empty string.
#
# Code points too large for pack('U*') make it raise RangeError; such NCRs
# are treated as invalid and removed instead of letting the error propagate.
def check_ncrs
  # Decide the replacement text for one NCR: the NCR itself when valid,
  # the empty string when not.
  keep_if_valid = lambda do |ncr, codepoint|
    begin
      [codepoint].pack('U*').as_bytes =~ UTF8_REGEX ? ncr : ''
    rescue RangeError
      # pack('U*') rejects values it cannot encode (e.g. &#xFFFFFFFFFF;).
      ''
    end
  end

  text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| keep_if_valid.call(m, $1.hex) }
  text.gsub(/&#(\d+);/) { |m| keep_if_valid.call(m, $1.to_i) }
end
Checks whether a string is valid UTF-8.
Returns a truthy value if the sequence of bytes in the string is valid UTF-8.
# File lib/itex_stringsupport.rb, line 107
#
# Reports whether the receiver is valid UTF-8.
#
# The receiver is first passed through check_ncrs, and the result is viewed
# as raw bytes and matched against UTF8_REGEX.
#
# Returns true when the bytes match UTF8_REGEX and false otherwise. The
# match result is converted to a real boolean so the method does not return
# the Integer offset or nil produced by =~.
def is_utf8?
  # Filter the NCRs via check_ncrs, then look at the result as bytes.
  text = self.check_ncrs.as_bytes

  # You might think expanding the NCRs with split/join is faster than the
  # gsub-based check_ncrs, but it isn't:
  #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
  #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
  #pieces = pieces.join.split(/&#(\d+);/)
  #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
  #text = pieces.join

  # Ensure the resulting string of bytes is valid UTF-8.
  !!(text =~ UTF8_REGEX)
end
# File lib/itex_stringsupport.rb, line 16
#
# Returns the length of the receiver as reported by String#length
# (String#size is the same method under another name).
def num_chars
  self.size
end
# File lib/itex_stringsupport.rb, line 70
#
# Returns a cleaned-up copy of the receiver, tagged as UTF-8.
#
# Works on a duplicate, so the receiver itself is left alone. The duplicate
# is passed through check_ncrs and tagged as UTF-8; then each character is
# viewed as bytes and kept only if those bytes match UTF8_REGEX. The
# surviving characters are joined and the result is tagged as UTF-8 again.
def purify
  cleaned = self.dup.check_ncrs.as_utf8
  byte_chunks = cleaned.chars.map { |char| char.as_bytes }
  well_formed = byte_chunks.grep(UTF8_REGEX)
  well_formed.join.as_utf8
end
Converts XHTML+MathML named entities in string to Numeric Character References
# File lib/itex_stringsupport.rb, line 2267
#
# Returns a copy of the receiver in which every entity reference of the form
# &name; (name made of ASCII letters and digits) is replaced by the result
# of calling convert_to_ncr on the captured name.
#
# NOTE(review): convert_to_ncr is defined elsewhere; what it returns for
# names it does not recognise is not visible here.
def to_ncr
  named_entity = /&([a-zA-Z0-9]+);/
  gsub(named_entity) { Regexp.last_match(1).convert_to_ncr }
end
Converts XHTML+MathML named entities in string to Numeric Character References
Substitution is done in-place.
# File lib/itex_stringsupport.rb, line 2278
#
# In-place variant of the named-entity to NCR conversion: every entity
# reference of the form &name; (name made of ASCII letters and digits) in
# the receiver is replaced by the result of calling convert_to_ncr on the
# captured name.
#
# Returns the result of String#gsub!, i.e. the receiver when at least one
# substitution was made and nil when nothing matched.
def to_ncr!
  named_entity = /&([a-zA-Z0-9]+);/
  gsub!(named_entity) { Regexp.last_match(1).convert_to_ncr }
end
Converts XHTML+MathML named entities in string to UTF-8
# File lib/itex_stringsupport.rb, line 2288
#
# Returns a copy of the receiver in which every entity reference of the form
# &name; (name made of ASCII letters and digits) is replaced by the result
# of calling convert_to_utf8 on the captured name.
#
# NOTE(review): convert_to_utf8 is defined elsewhere; what it returns for
# names it does not recognise is not visible here.
def to_utf8
  named_entity = /&([a-zA-Z0-9]+);/
  gsub(named_entity) { Regexp.last_match(1).convert_to_utf8 }

  # You might think this is faster, but it isn't
  # pieces = self.split(/&([a-zA-Z0-9]+);/)
  # 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
  # pieces.join
end
Converts XHTML+MathML named entities in string to UTF-8.
Substitution is done in-place.
# File lib/itex_stringsupport.rb, line 2305
#
# In-place variant of the named-entity to UTF-8 conversion: every entity
# reference of the form &name; (name made of ASCII letters and digits) in
# the receiver is replaced by the result of calling convert_to_utf8 on the
# captured name.
#
# Returns the result of String#gsub!, i.e. the receiver when at least one
# substitution was made and nil when nothing matched.
def to_utf8!
  named_entity = /&([a-zA-Z0-9]+);/
  gsub!(named_entity) { Regexp.last_match(1).convert_to_utf8 }
end