3 module ActiveSupport
#:nodoc:
4 module Multibyte
#:nodoc:
5 # Chars enables you to work transparently with UTF-8 encoding in the Ruby String class without having extensive
6 # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
7 # encoding safe manner. All the normal String methods are also implemented on the proxy.
9 # String methods are proxied through the Chars object, and can be accessed through the +mb_chars+ method. Methods
10 # which would normally return a String object now return a Chars object so methods can be chained.
12 # "The Perfect String ".mb_chars.downcase.strip.normalize #=> "the perfect string"
14 # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
15 # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
17 # bad.explicit_checking_method "T".mb_chars.downcase.to_s
19 # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
20 # encodings you can write your own multibyte string handler and configure it through
21 # ActiveSupport::Multibyte.proxy_class.
25 # @wrapped_string.size / 4
28 # def self.accepts?(string)
29 # string.length % 4 == 0
33 # ActiveSupport::Multibyte.proxy_class = CharsForUTF32
35 # Hangul character boundaries and properties
43 HANGUL_NCOUNT
= HANGUL_VCOUNT
* HANGUL_TCOUNT
45 HANGUL_SLAST
= HANGUL_SBASE
+ HANGUL_SCOUNT
46 HANGUL_JAMO_FIRST
= 0x1100
47 HANGUL_JAMO_LAST
= 0x11FF
49 # All the unicode whitespace
50 UNICODE_WHITESPACE
= [
51 (0x0009..0x000D
).to_a
, # White_Space # Cc [5] <control-0009>..<control-000D>
52 0x0020, # White_Space # Zs SPACE
53 0x0085, # White_Space # Cc <control-0085>
54 0x00A0, # White_Space # Zs NO-BREAK SPACE
55 0x1680, # White_Space # Zs OGHAM SPACE MARK
56 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
57 (0x2000..0x200A
).to_a
, # White_Space # Zs [11] EN QUAD..HAIR SPACE
58 0x2028, # White_Space # Zl LINE SEPARATOR
59 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
60 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
61 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
62 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
# The BOM (byte order mark, U+FEFF) is a non-rendering character used to distinguish between little and big
# endian. Byte order is not an issue in UTF-8, so the BOM is listed next to the Unicode whitespace codepoints.
UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [0xFEFF] # ZERO-WIDTH NO-BREAK SPACE aka BOM
# Returns the source for a regular expression alternation that matches the passed Unicode codepoints: every
# codepoint is packed into its UTF-8 character and the characters are joined with '|'. The characters are
# inserted as-is (no regexp escaping is applied).
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
  alternatives = array_of_codepoints.map { |codepoint| [codepoint].pack('U*') }
  alternatives.join('|')
end
73 UNICODE_TRAILERS_PAT
= /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
74 UNICODE_LEADERS_PAT
= /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
76 # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
79 [\xc2-\xdf] [\x80-\xbf] |
80 \xe0 [\xa0-\xbf] [\x80-\xbf] |
81 [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
82 \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
83 [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
84 \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
87 attr_reader
:wrapped_string
88 alias to_s wrapped_string
89 alias to_str wrapped_string
# The constructor is chosen once, when this code is loaded: interpreters whose strings respond to
# +force_encoding+ get the variant that tags the wrapped string as UTF-8.
if '1.9'.respond_to?(:force_encoding)
  # Creates a new Chars instance by wrapping _string_. The string itself (not a copy) is stored and, unless it
  # is frozen, its encoding is forced to UTF-8 in place.
  def initialize(string)
    @wrapped_string = string
    return if string.frozen?
    string.force_encoding(Encoding::UTF_8)
  end
else
  # Creates a new Chars instance by wrapping _string_ as it is.
  def initialize(string) #:nodoc:
    @wrapped_string = string
  end
end
103 # Forward all undefined methods to the wrapped string.
104 def method_missing(method
, *args
, &block
)
105 if method
.to_s
=~
/!$/
106 @wrapped_string.__send__(method
, *args
, &block
)
109 result
= @wrapped_string.__send__(method
, *args
, &block
)
110 result
.kind_of
?(String
) ? chars(result
) : result
114 # Returns +true+ if _obj_ responds to the given method. Private methods are included in the search
115 # only if the optional second parameter evaluates to +true+.
116 def respond_to
?(method
, include_private
=false)
117 super || @wrapped_string.respond_to
?(method
, include_private
) || false
120 # Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
121 def acts_like_string
?
# Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns +false+
# straight away when $KCODE is not 'UTF8'; otherwise the answer is whatever consumes? reports for _string_.
def self.wants?(string)
  return false unless $KCODE == 'UTF8'
  consumes?(string)
end
131 # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
132 def self.consumes
?(string
)
133 # Unpack is a little bit faster than regular expressions.
142 # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
143 # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
144 # See <tt>String#<=></tt> for more details.
147 # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1
149 @wrapped_string <=> other
.to_s
152 # Returns a new Chars object containing the _other_ object concatenated to the string.
155 # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
160 # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
163 # 'Café périferôl'.mb_chars =~ /ô/ #=> 12
165 translate_offset(@wrapped_string =~ other
)
168 # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
169 # instances instead of String. This makes chaining methods easier.
172 # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
174 @wrapped_string.split(*args
).map
{ |i
| i
.mb_chars
}
177 # Inserts the passed string at specified codepoint offsets.
180 # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
181 def insert(offset
, fragment
)
182 unpacked
= self.class.u_unpack(@wrapped_string)
183 unless offset
> unpacked
.length
184 @wrapped_string.replace(
185 self.class.u_unpack(@wrapped_string).insert(offset
, *self.class.u_unpack(fragment
)).pack('U*')
188 raise IndexError
, "index #{offset} out of string"
193 # Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
196 # 'Café'.mb_chars.include?('é') #=> true
198 # We have to redefine this method because Enumerable defines it.
199 @wrapped_string.include?(other
)
# Returns the position of _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
# _offset_ is passed on unchanged to <tt>String#index</tt> of the wrapped string.
#
# Example:
#   'Café périferôl'.mb_chars.index('ô') #=> 12
#   'Café périferôl'.mb_chars.index(/\w/u) #=> 0
def index(needle, offset=0)
  position = @wrapped_string.index(needle, offset)
  return nil unless position
  # Count the codepoints that precede the match.
  leading = @wrapped_string.slice(0...position)
  self.class.u_unpack(leading).size
end
212 # Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
217 # s.mb_chars[2] = "e" # Replace character with offset 2
222 # s.mb_chars[1, 2] = "ö" # Replace 2 characters at character offset 1
226 replace_by
= args
.pop
227 # Indexed replace with regular expressions already works
228 if args
.first
.is_a
?(Regexp
)
229 @wrapped_string[*args
] = replace_by
231 result
= self.class.u_unpack(@wrapped_string)
232 if args
[0].is_a
?(Fixnum
)
233 raise IndexError
, "index #{args[0]} out of string" if args
[0] >= result
.length
235 max
= args
[1].nil? ? min
: (min
+ args
[1] - 1)
236 range
= Range
.new(min
, max
)
237 replace_by
= [replace_by
].pack('U') if replace_by
.is_a
?(Fixnum
)
238 elsif args
.first
.is_a
?(Range
)
239 raise RangeError
, "#{args[0]} out of range" if args
[0].min
>= result
.length
242 needle
= args
[0].to_s
244 max
= min
+ self.class.u_unpack(needle
).length
- 1
245 range
= Range
.new(min
, max
)
247 result
[range
] = self.class.u_unpack(replace_by
)
248 @wrapped_string.replace(result
.pack('U*'))
# Works just like <tt>String#rjust</tt>, only integer specifies characters instead of bytes. The work is
# delegated to +justify+ with the <tt>:right</tt> direction.
#
# Example:
#   "¾ cup".mb_chars.rjust(8).to_s
#   "¾ cup".mb_chars.rjust(8, "*").to_s # Use a custom padding string
def rjust(integer, padstr=' ')
  direction = :right
  justify(integer, direction, padstr)
end
# Works just like <tt>String#ljust</tt>, only integer specifies characters instead of bytes. The work is
# delegated to +justify+ with the <tt>:left</tt> direction.
#
# Example:
#   "¾ cup".mb_chars.ljust(8).to_s
#   "¾ cup".mb_chars.ljust(8, "*").to_s # Use a custom padding string
def ljust(integer, padstr=' ')
  direction = :left
  justify(integer, direction, padstr)
end
# Works just like <tt>String#center</tt>, only integer specifies characters instead of bytes. The work is
# delegated to +justify+ with the <tt>:center</tt> direction.
#
# Example:
#   "¾ cup".mb_chars.center(8).to_s
#   "¾ cup".mb_chars.center(8, "*").to_s # Use a custom padding string
def center(integer, padstr=' ')
  direction = :center
  justify(integer, direction, padstr)
end
291 # Strips entire range of Unicode whitespace from the right of the string.
293 chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT
, ''))
296 # Strips entire range of Unicode whitespace from the left of the string.
298 chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT
, ''))
301 # Strips entire range of Unicode whitespace from the right and left of the string.
306 # Returns the number of codepoints in the string
308 self.class.u_unpack(@wrapped_string).size
310 alias_method
:length, :size
312 # Reverses all characters in the string.
315 # 'Café'.mb_chars.reverse.to_s #=> 'éfaC'
317 chars(self.class.u_unpack(@wrapped_string).reverse
.pack('U*'))
320 # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that character.
324 # 'こんにちは'.mb_chars.slice(2..3).to_s #=> "にち"
327 raise ArgumentError
, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
328 elsif (args
.size
== 2 && !(args
.first
.is_a
?(Numeric
) || args
.first
.is_a
?(Regexp
)))
329 raise TypeError
, "cannot convert #{args.first.class} into Integer" # Do as if we were native
330 elsif (args
.size
== 2 && !args
[1].is_a
?(Numeric
))
331 raise TypeError
, "cannot convert #{args[1].class} into Integer" # Do as if we were native
332 elsif args
[0].kind_of
? Range
333 cps
= self.class.u_unpack(@wrapped_string).slice(*args
)
334 result
= cps
.nil? ? nil : cps
.pack('U*')
335 elsif args
[0].kind_of
? Regexp
336 result
= @wrapped_string.slice(*args
)
337 elsif args
.size
== 1 && args
[0].kind_of
?(Numeric
)
338 character
= self.class.u_unpack(@wrapped_string)[args
[0]]
339 result
= character
.nil? ? nil : [character
].pack('U')
341 result
= self.class.u_unpack(@wrapped_string).slice(*args
).pack('U*')
343 result
.nil? ? nil : chars(result
)
345 alias_method
:[], :slice
347 # Like <tt>String#slice!</tt>, except instead of byte offsets you specify character offsets.
351 # s.mb_chars.slice!(2..3).to_s #=> "にち"
359 # Returns the codepoint of the first character in the string.
362 # 'こんにちは'.mb_chars.ord #=> 12371
364 self.class.u_unpack(@wrapped_string)[0]
367 # Convert characters in the string to uppercase.
370 # 'Laurent, òu sont les tests?'.mb_chars.upcase.to_s #=> "LAURENT, ÒU SONT LES TESTS?"
372 apply_mapping
:uppercase_mapping
375 # Convert characters in the string to lowercase.
378 # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
380 apply_mapping
:lowercase_mapping
383 # Converts the first character to uppercase and the remainder to lowercase.
386 # 'über'.mb_chars.capitalize.to_s #=> "Über"
388 (slice(0) || chars('')).upcase
+ (slice(1..-1) || chars('')).downcase
391 # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
392 # passing strings to databases and validations.
394 # * <tt>str</tt> - The string to perform normalization on.
395 # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
396 # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
397 # ActiveSupport::Multibyte.default_normalization_form
398 def normalize(form
=ActiveSupport
::Multibyte.default_normalization_form
)
399 # See http://www.unicode.org/reports/tr15, Table 1
400 codepoints
= self.class.u_unpack(@wrapped_string)
403 self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints
))
405 self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints
)))
407 self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints
))
409 self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints
)))
411 raise ArgumentError
, "#{form} is not a valid normalization variant", caller
415 # Performs canonical decomposition on all the characters.
419 # 'é'.mb_chars.decompose.to_s.length #=> 3
421 chars(self.class.decompose_codepoints(:canonical, self.class.u_unpack(@wrapped_string)).pack('U*'))
424 # Performs composition on all the characters.
428 # 'é'.mb_chars.compose.to_s.length #=> 2
430 chars(self.class.compose_codepoints(self.class.u_unpack(@wrapped_string)).pack('U*'))
433 # Returns the number of grapheme clusters in the string.
436 # 'क्षि'.mb_chars.length #=> 4
437 # 'क्षि'.mb_chars.g_length #=> 3
439 self.class.g_unpack(@wrapped_string).length
442 # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
444 chars(self.class.tidy_bytes(@wrapped_string))
447 %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize
).each
do |method
|
448 define_method("#{method}!") do |*args
|
450 @wrapped_string = send(method
, *args
).to_s
452 @wrapped_string = send(method
).to_s
460 # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't valid UTF-8.
464 # Chars.u_unpack('Café') #=> [67, 97, 102, 233]
469 raise EncodingError
, 'malformed UTF-8 character'
# Detect whether the codepoint is in a certain character class. Returns +true+ when it's in at least one of the
# specified character classes and +false+ otherwise. Each class is looked up in <tt>UCD.boundary</tt> and
# compared to the codepoint with <tt>===</tt>. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>,
# <tt>:l</tt>, <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
#
# Primarily used by the grapheme cluster support.
def in_char_class?(codepoint, classes)
  matching_class = classes.find { |name| UCD.boundary[name] === codepoint }
  matching_class ? true : false
end
482 # Unpack the string at grapheme boundaries. Returns a list of character lists.
485 # Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
486 # Chars.g_unpack('Café') #=> [[67], [97], [102], [233]]
488 codepoints
= u_unpack(string
)
492 eoc
= codepoints
.length
495 previous
= codepoints
[pos-1
]
496 current
= codepoints
[pos
]
499 one
= ( previous
== UCD
.boundary
[:cr] and current
== UCD
.boundary
[:lf] ) or
501 two
= ( UCD
.boundary
[:l] === previous
and in_char_class
?(current
, [:l,:v,:lv,:lvt]) ) or
503 three
= ( in_char_class
?(previous
, [:lv,:v]) and in_char_class
?(current
, [:v,:t]) ) or
505 four
= ( in_char_class
?(previous
, [:lvt,:t]) and UCD
.boundary
[:t] === current
) or
507 five
= (UCD
.boundary
[:extend] === current
)
510 unpacked
<< codepoints
[marker
..pos-1
]
517 # Reverse operation of g_unpack.
520 # Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि'
522 (unpacked
.flatten
).pack('U*')
525 def padding(padsize
, padstr
=' ') #:nodoc:
527 new(padstr
* ((padsize
/ u_unpack(padstr
).size
) + 1)).slice(0, padsize
)
533 # Re-order codepoints so the string becomes canonical.
534 def reorder_characters(codepoints
)
535 length
= codepoints
.length-
1
537 while pos
< length
do
538 cp1
, cp2
= UCD
.codepoints
[codepoints
[pos
]], UCD
.codepoints
[codepoints
[pos
+1]]
539 if (cp1
.combining_class
> cp2
.combining_class
) && (cp2
.combining_class
> 0)
540 codepoints
[pos
..pos
+1] = cp2
.code
, cp1
.code
541 pos
+= (pos
> 0 ? -1 : 1)
549 # Decompose composed characters to the decomposed form.
550 def decompose_codepoints(type
, codepoints
)
551 codepoints
.inject([]) do |decomposed
, cp
|
552 # if it's a hangul syllable starter character
553 if HANGUL_SBASE
<= cp
and cp
< HANGUL_SLAST
554 sindex
= cp
- HANGUL_SBASE
555 ncp
= [] # new codepoints
556 ncp
<< HANGUL_LBASE
+ sindex
/ HANGUL_NCOUNT
557 ncp
<< HANGUL_VBASE
+ (sindex
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
558 tindex
= sindex
% HANGUL_TCOUNT
559 ncp
<< (HANGUL_TBASE
+ tindex
) unless tindex
== 0
560 decomposed
.concat ncp
561 # if the codepoint is decomposable with the current decomposition type
562 elsif (ncp
= UCD
.codepoints
[cp
].decomp_mapping
) and (!UCD
.codepoints
[cp
].decomp_type
|| type
== :compatability)
563 decomposed
.concat
decompose_codepoints(type
, ncp
.dup
)
570 # Compose decomposed characters to the composed form.
571 def compose_codepoints(codepoints
)
573 eoa
= codepoints
.length
- 1
575 starter_char
= codepoints
[0]
576 previous_combining_class
= -1
579 lindex
= starter_char
- HANGUL_LBASE
581 if 0 <= lindex
and lindex
< HANGUL_LCOUNT
582 vindex
= codepoints
[starter_pos
+1] - HANGUL_VBASE
rescue vindex
= -1
583 if 0 <= vindex
and vindex
< HANGUL_VCOUNT
584 tindex
= codepoints
[starter_pos
+2] - HANGUL_TBASE
rescue tindex
= -1
585 if 0 <= tindex
and tindex
< HANGUL_TCOUNT
593 codepoints
[starter_pos
..j
] = (lindex
* HANGUL_VCOUNT
+ vindex
) * HANGUL_TCOUNT
+ tindex
+ HANGUL_SBASE
596 starter_char
= codepoints
[starter_pos
]
597 # -- Other characters
599 current_char
= codepoints
[pos
]
600 current
= UCD
.codepoints
[current_char
]
601 if current
.combining_class
> previous_combining_class
602 if ref
= UCD
.composition_map
[starter_char
]
603 composition
= ref
[current_char
]
607 unless composition
.nil?
608 codepoints
[starter_pos
] = composition
609 starter_char
= composition
610 codepoints
.delete_at pos
613 previous_combining_class
= -1
615 previous_combining_class
= current
.combining_class
618 previous_combining_class
= current
.combining_class
620 if current
.combining_class
== 0
622 starter_char
= codepoints
[pos
]
629 # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
630 def tidy_bytes(string
)
631 string
.split(//u
).map
do |c
|
632 c
.force_encoding(Encoding
::ASCII) if c
.respond_to
?(:force_encoding)
634 if !UTF8_PAT
.match(c
)
637 n
< 160 ? [UCD
.cp1252
[n
] || n
].pack('U') :
638 n
< 192 ? "\xC2" + n
.chr
: "\xC3" + (n-64
).chr
648 def translate_offset(byte_offset
) #:nodoc:
649 return nil if byte_offset
.nil?
650 return 0 if @wrapped_string == ''
651 chunk
= @wrapped_string[0..byte_offset
]
654 chunk
.unpack('U*').length
- 1
655 rescue ArgumentError
=> e
656 chunk
= @wrapped_string[0..(byte_offset
+=1)]
657 # Stop retrying at the end of the string
658 raise e
unless byte_offset
< chunk
.length
659 # We damaged a character, retry
662 # Catch the ArgumentError so we can throw our own
664 raise EncodingError
, 'malformed UTF-8 character'
668 def justify(integer
, way
, padstr
=' ') #:nodoc:
669 raise ArgumentError
, "zero width padding" if padstr
.length
== 0
670 padsize
= integer
- size
671 padsize
= padsize
> 0 ? padsize
: 0
674 result
= @wrapped_string.dup
.insert(0, self.class.padding(padsize
, padstr
))
676 result
= @wrapped_string.dup
.insert(-1, self.class.padding(padsize
, padstr
))
678 lpad
= self.class.padding((padsize
/ 2.0).floor
, padstr
)
679 rpad
= self.class.padding((padsize
/ 2.0).ceil
, padstr
)
680 result
= @wrapped_string.dup
.insert(0, lpad
).insert(-1, rpad
)
685 def apply_mapping(mapping
) #:nodoc:
686 chars(self.class.u_unpack(@wrapped_string).map
do |codepoint
|
687 cp
= UCD
.codepoints
[codepoint
]
688 if cp
and (ncp
= cp
.send(mapping
)) and ncp
> 0
# Wraps _string_ in a new instance of the receiver's own class.
def chars(string) #:nodoc:
  proxy_class = self.class
  proxy_class.new(string)
end