Ruby 1.9.3p484 (2013-11-22 revision 43786)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author: usa $
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/ruby.h"
15 #include "ruby/re.h"
16 #include "ruby/encoding.h"
17 #include "internal.h"
18 #include <assert.h>
19 
20 #define BEG(no) (regs->beg[(no)])
21 #define END(no) (regs->end[(no)])
22 
23 #include <math.h>
24 #include <ctype.h>
25 
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 
30 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
31 
32 #undef rb_str_new_cstr
33 #undef rb_tainted_str_new_cstr
34 #undef rb_usascii_str_new_cstr
35 #undef rb_external_str_new_cstr
36 #undef rb_locale_str_new_cstr
37 #undef rb_str_new2
38 #undef rb_str_new3
39 #undef rb_str_new4
40 #undef rb_str_new5
41 #undef rb_tainted_str_new2
42 #undef rb_usascii_str_new2
43 #undef rb_str_dup_frozen
44 #undef rb_str_buf_new_cstr
45 #undef rb_str_buf_new2
46 #undef rb_str_buf_cat2
47 #undef rb_str_cat2
48 
49 static VALUE rb_str_clear(VALUE str);
50 
53 
/* NOTE(review): presumably an upper bound on bytes per character; not used in the visible part of the file -- confirm. */
54 #define RUBY_MAX_CHAR_LEN 16
/* Flag bits kept in RBASIC(str)->flags.  A string with STR_TMPLOCK set is
 * rejected for modification ("can't modify string; temporarily locked"). */
55 #define STR_TMPLOCK FL_USER7
/* Set when the string uses the heap representation (as.heap) rather than
 * the embedded one; see STR_EMBED_P. */
56 #define STR_NOEMBED FL_USER1
57 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */
58 #define STR_ASSOC FL_USER3
/* True only when the string is on the heap AND flagged ELTS_SHARED. */
59 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* True only when the string is on the heap AND flagged STR_ASSOC. */
60 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* "No capacity" states: a heap string that is shared or associated uses
 * as.heap.aux.shared, so as.heap.aux.capa must not be read or written. */
61 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
62 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears the shared/assoc bits, but only on heap strings; embedded strings
 * are left untouched. */
63 #define STR_UNSET_NOCAPA(s) do {\
64  if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
65 } while (0)
66 
67 
/* Switch str to the heap representation: sets STR_NOEMBED and zeroes the
 * embedded-length bits in the flags word. */
68 #define STR_SET_NOEMBED(str) do {\
69  FL_SET((str), STR_NOEMBED);\
70  STR_SET_EMBED_LEN((str), 0);\
71 } while (0)
/* Switch str back to the embedded representation (clears STR_NOEMBED only;
 * the embedded length must be set separately). */
72 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* Non-zero when str uses the embedded representation. */
73 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Store n as the embedded length, which lives in the flag bits selected by
 * RSTRING_EMBED_LEN_MASK / RSTRING_EMBED_LEN_SHIFT.  n is evaluated once
 * (copied into tmp_n); str is evaluated more than once. */
74 #define STR_SET_EMBED_LEN(str, n) do { \
75  long tmp_n = (n);\
76  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
77  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
78 } while (0)
79 
/* Set the byte length of str to n, writing it to the flag bits for an
 * embedded string or to as.heap.len for a heap string.  Does not touch the
 * buffer contents or the terminator. */
80 #define STR_SET_LEN(str, n) do { \
81  if (STR_EMBED_P(str)) {\
82  STR_SET_EMBED_LEN((str), (n));\
83  }\
84  else {\
85  RSTRING(str)->as.heap.len = (n);\
86  }\
87 } while (0)
88 
/* Decrease the byte length of str by one, in whichever representation
 * (embedded or heap) the string currently uses.  No check is made that the
 * length is non-zero beforehand. */
89 #define STR_DEC_LEN(str) do {\
90  if (STR_EMBED_P(str)) {\
91  long n = RSTRING_LEN(str);\
92  n--;\
93  STR_SET_EMBED_LEN((str), n);\
94  }\
95  else {\
96  RSTRING(str)->as.heap.len--;\
97  }\
98 } while (0)
99 
/*
 * Change the buffer capacity of str to `capacity` bytes (plus one extra
 * byte, which is always allocated).
 *
 * - Embedded string, capacity fits in RSTRING_EMBED_LEN_MAX: nothing is done.
 * - Embedded string, larger capacity: a heap buffer is allocated, the current
 *   contents are copied into it, and the string is switched to the heap
 *   representation.  The heap length is taken from RSTRING_LEN while the
 *   string is still flagged as embedded, i.e. before STR_SET_NOEMBED.
 * - Heap string: the buffer is reallocated; aux.capa is only recorded when
 *   the string is not in a "no capacity" (shared/assoc) state.
 *
 * Both arguments are evaluated several times, so they must be free of side
 * effects.  The terminator byte is not written here.
 */
100 #define RESIZE_CAPA(str,capacity) do {\
101  if (STR_EMBED_P(str)) {\
102  if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
103  char *tmp = ALLOC_N(char, (capacity)+1);\
104  memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
105  RSTRING(str)->as.heap.ptr = tmp;\
106  RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
107  STR_SET_NOEMBED(str);\
108  RSTRING(str)->as.heap.aux.capa = (capacity);\
109  }\
110  }\
111  else {\
112  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
113  if (!STR_NOCAPA_P(str))\
114  RSTRING(str)->as.heap.aux.capa = (capacity);\
115  }\
116 } while (0)
117 
/* Code-range predicates.  Both go through rb_enc_str_coderange(), so they
 * may scan the string if its code range has not been determined yet. */
118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
120 
/* Look up the rb_encoding object for the encoding index stored on str. */
121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
122 
123 static inline int
125 {
126  rb_encoding *enc;
127 
128  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
129  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
130  return 1;
131 
132  enc = STR_ENC_GET(str);
133  if (rb_enc_mbmaxlen(enc) == 1)
134  return 1;
135 
136  /* Conservative. Possibly single byte.
137  * "\xa1" in Shift_JIS for example. */
138  return 0;
139 }
140 
142 
/*
 * Find the first byte in [p, e) for which ISASCII() is false.
 *
 * Returns a pointer to that byte, or NULL when every byte in the range is
 * ASCII.  When SIZEOF_VALUE is 4 or 8 and the range is longer than two
 * VALUE-sized words, the middle of the range is tested one aligned word at
 * a time against NONASCII_MASK (the high bit of every byte); the unaligned
 * head and the remaining tail are tested byte by byte.
 *
 * NONASCII_MASK is deliberately defined here at file scope (inside the
 * function body) because later code tests for it with #ifdef.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        /* unaligned head: one byte at a time */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned middle: skip whole words that contain no high bit */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* resume byte-wise either at the suspicious word or at the tail */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
179 
180 static int
181 coderange_scan(const char *p, long len, rb_encoding *enc)
182 {
183  const char *e = p + len;
184 
185  if (rb_enc_to_index(enc) == 0) {
186  /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
187  p = search_nonascii(p, e);
189  }
190 
191  if (rb_enc_asciicompat(enc)) {
192  p = search_nonascii(p, e);
193  if (!p) {
194  return ENC_CODERANGE_7BIT;
195  }
196  while (p < e) {
197  int ret = rb_enc_precise_mbclen(p, e, enc);
198  if (!MBCLEN_CHARFOUND_P(ret)) {
199  return ENC_CODERANGE_BROKEN;
200  }
201  p += MBCLEN_CHARFOUND_LEN(ret);
202  if (p < e) {
203  p = search_nonascii(p, e);
204  if (!p) {
205  return ENC_CODERANGE_VALID;
206  }
207  }
208  }
209  if (e < p) {
210  return ENC_CODERANGE_BROKEN;
211  }
212  return ENC_CODERANGE_VALID;
213  }
214 
215  while (p < e) {
216  int ret = rb_enc_precise_mbclen(p, e, enc);
217 
218  if (!MBCLEN_CHARFOUND_P(ret)) {
219  return ENC_CODERANGE_BROKEN;
220  }
221  p += MBCLEN_CHARFOUND_LEN(ret);
222  }
223  if (e < p) {
224  return ENC_CODERANGE_BROKEN;
225  }
226  return ENC_CODERANGE_VALID;
227 }
228 
229 long
230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
231 {
232  const char *p = s;
233 
234  if (*cr == ENC_CODERANGE_BROKEN)
235  return e - s;
236 
237  if (rb_enc_to_index(enc) == 0) {
238  /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
239  p = search_nonascii(p, e);
241  return e - s;
242  }
243  else if (rb_enc_asciicompat(enc)) {
244  p = search_nonascii(p, e);
245  if (!p) {
246  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
247  return e - s;
248  }
249  while (p < e) {
250  int ret = rb_enc_precise_mbclen(p, e, enc);
251  if (!MBCLEN_CHARFOUND_P(ret)) {
253  return p - s;
254  }
255  p += MBCLEN_CHARFOUND_LEN(ret);
256  if (p < e) {
257  p = search_nonascii(p, e);
258  if (!p) {
259  *cr = ENC_CODERANGE_VALID;
260  return e - s;
261  }
262  }
263  }
265  return p - s;
266  }
267  else {
268  while (p < e) {
269  int ret = rb_enc_precise_mbclen(p, e, enc);
270  if (!MBCLEN_CHARFOUND_P(ret)) {
272  return p - s;
273  }
274  p += MBCLEN_CHARFOUND_LEN(ret);
275  }
277  return p - s;
278  }
279 }
280 
281 static inline void
283 {
284  rb_enc_set_index(str1, ENCODING_GET(str2));
285 }
286 
287 static void
289 {
290  /* this function is designed for copying encoding and coderange
291  * from src to new string "dest" which is made from the part of src.
292  */
293  str_enc_copy(dest, src);
294  switch (ENC_CODERANGE(src)) {
295  case ENC_CODERANGE_7BIT:
297  break;
298  case ENC_CODERANGE_VALID:
299  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
302  else
304  break;
305  default:
306  if (RSTRING_LEN(dest) == 0) {
307  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
309  else
311  }
312  break;
313  }
314 }
315 
316 static void
318 {
319  str_enc_copy(dest, src);
320  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
321 }
322 
323 int
325 {
326  int cr = ENC_CODERANGE(str);
327 
328  if (cr == ENC_CODERANGE_UNKNOWN) {
329  rb_encoding *enc = STR_ENC_GET(str);
330  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
331  ENC_CODERANGE_SET(str, cr);
332  }
333  return cr;
334 }
335 
336 int
338 {
339  rb_encoding *enc = STR_ENC_GET(str);
340 
341  if (!rb_enc_asciicompat(enc))
342  return FALSE;
343  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
344  return TRUE;
345  return FALSE;
346 }
347 
348 static inline void
349 str_mod_check(VALUE s, const char *p, long len)
350 {
351  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
352  rb_raise(rb_eRuntimeError, "string modified");
353  }
354 }
355 
356 size_t
358 {
359  if (STR_EMBED_P(str)) {
360  return RSTRING_EMBED_LEN_MAX;
361  }
362  else if (STR_NOCAPA_P(str)) {
363  return RSTRING(str)->as.heap.len;
364  }
365  else {
366  return RSTRING(str)->as.heap.aux.capa;
367  }
368 }
369 
370 static inline VALUE
372 {
373  NEWOBJ(str, struct RString);
374  OBJSETUP(str, klass, T_STRING);
375 
376  str->as.heap.ptr = 0;
377  str->as.heap.len = 0;
378  str->as.heap.aux.capa = 0;
379 
380  return (VALUE)str;
381 }
382 
383 static VALUE
384 str_new(VALUE klass, const char *ptr, long len)
385 {
386  VALUE str;
387 
388  if (len < 0) {
389  rb_raise(rb_eArgError, "negative string size (or size too big)");
390  }
391 
392  str = str_alloc(klass);
393  if (len > RSTRING_EMBED_LEN_MAX) {
394  RSTRING(str)->as.heap.aux.capa = len;
395  RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
396  STR_SET_NOEMBED(str);
397  }
398  else if (len == 0) {
400  }
401  if (ptr) {
402  memcpy(RSTRING_PTR(str), ptr, len);
403  }
404  STR_SET_LEN(str, len);
405  RSTRING_PTR(str)[len] = '\0';
406  return str;
407 }
408 
409 VALUE
410 rb_str_new(const char *ptr, long len)
411 {
412  return str_new(rb_cString, ptr, len);
413 }
414 
415 VALUE
416 rb_usascii_str_new(const char *ptr, long len)
417 {
418  VALUE str = rb_str_new(ptr, len);
420  return str;
421 }
422 
423 VALUE
424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
425 {
426  VALUE str = rb_str_new(ptr, len);
427  rb_enc_associate(str, enc);
428  return str;
429 }
430 
431 VALUE
432 rb_str_new_cstr(const char *ptr)
433 {
434  if (!ptr) {
435  rb_raise(rb_eArgError, "NULL pointer given");
436  }
437  return rb_str_new(ptr, strlen(ptr));
438 }
439 
441 #define rb_str_new2 rb_str_new_cstr
442 
443 VALUE
444 rb_usascii_str_new_cstr(const char *ptr)
445 {
446  VALUE str = rb_str_new2(ptr);
448  return str;
449 }
450 
452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
453 
454 VALUE
455 rb_tainted_str_new(const char *ptr, long len)
456 {
457  VALUE str = rb_str_new(ptr, len);
458 
459  OBJ_TAINT(str);
460  return str;
461 }
462 
463 VALUE
464 rb_tainted_str_new_cstr(const char *ptr)
465 {
466  VALUE str = rb_str_new2(ptr);
467 
468  OBJ_TAINT(str);
469  return str;
470 }
471 
473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
474 
475 VALUE
476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
477 {
478  rb_econv_t *ec;
479  rb_econv_result_t ret;
480  long len;
481  VALUE newstr;
482  const unsigned char *sp;
483  unsigned char *dp;
484 
485  if (!to) return str;
486  if (from == to) return str;
487  if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
488  to == rb_ascii8bit_encoding()) {
489  if (STR_ENC_GET(str) != to) {
490  str = rb_str_dup(str);
491  rb_enc_associate(str, to);
492  }
493  return str;
494  }
495 
496  len = RSTRING_LEN(str);
497  newstr = rb_str_new(0, len);
498 
499  retry:
500  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
501  if (!ec) return str;
502 
503  sp = (unsigned char*)RSTRING_PTR(str);
504  dp = (unsigned char*)RSTRING_PTR(newstr);
505  ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
506  &dp, (unsigned char*)RSTRING_END(newstr), 0);
507  rb_econv_close(ec);
508  switch (ret) {
510  /* destination buffer short */
511  len = len < 2 ? 2 : len * 2;
512  rb_str_resize(newstr, len);
513  goto retry;
514 
515  case econv_finished:
516  len = dp - (unsigned char*)RSTRING_PTR(newstr);
517  rb_str_set_len(newstr, len);
518  rb_enc_associate(newstr, to);
519  return newstr;
520 
521  default:
522  /* some error, return original */
523  return str;
524  }
525 }
526 
527 VALUE
529 {
530  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
531 }
532 
533 VALUE
534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
535 {
536  VALUE str;
537 
538  str = rb_tainted_str_new(ptr, len);
539  if (eenc == rb_usascii_encoding() &&
542  return str;
543  }
544  rb_enc_associate(str, eenc);
545  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
546 }
547 
548 VALUE
549 rb_external_str_new(const char *ptr, long len)
550 {
552 }
553 
554 VALUE
555 rb_external_str_new_cstr(const char *ptr)
556 {
558 }
559 
560 VALUE
561 rb_locale_str_new(const char *ptr, long len)
562 {
564 }
565 
566 VALUE
567 rb_locale_str_new_cstr(const char *ptr)
568 {
570 }
571 
572 VALUE
573 rb_filesystem_str_new(const char *ptr, long len)
574 {
576 }
577 
578 VALUE
580 {
582 }
583 
584 VALUE
586 {
588 }
589 
590 VALUE
592 {
593  return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
594 }
595 
596 VALUE
598 {
599  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
600 }
601 
602 static VALUE
604 {
605  if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
606  STR_SET_EMBED(str2);
607  memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
608  STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
609  }
610  else {
611  str = rb_str_new_frozen(str);
612  FL_SET(str2, STR_NOEMBED);
613  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
614  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
615  RSTRING(str2)->as.heap.aux.shared = str;
616  FL_SET(str2, ELTS_SHARED);
617  }
618  rb_enc_cr_str_exact_copy(str2, str);
619 
620  return str2;
621 }
622 
623 static VALUE
625 {
626  return str_replace_shared(str_alloc(klass), str);
627 }
628 
629 static VALUE
630 str_new3(VALUE klass, VALUE str)
631 {
632  return str_new_shared(klass, str);
633 }
634 
635 VALUE
637 {
638  VALUE str2 = str_new3(rb_obj_class(str), str);
639 
640  OBJ_INFECT(str2, str);
641  return str2;
642 }
643 
645 #define rb_str_new3 rb_str_new_shared
646 
647 static VALUE
648 str_new4(VALUE klass, VALUE str)
649 {
650  VALUE str2;
651 
652  str2 = str_alloc(klass);
653  STR_SET_NOEMBED(str2);
654  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
655  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
656  if (STR_SHARED_P(str)) {
657  VALUE shared = RSTRING(str)->as.heap.aux.shared;
658  assert(OBJ_FROZEN(shared));
659  FL_SET(str2, ELTS_SHARED);
660  RSTRING(str2)->as.heap.aux.shared = shared;
661  }
662  else {
663  FL_SET(str, ELTS_SHARED);
664  RSTRING(str)->as.heap.aux.shared = str2;
665  }
666  rb_enc_cr_str_exact_copy(str2, str);
667  OBJ_INFECT(str2, str);
668  return str2;
669 }
670 
671 VALUE
673 {
674  VALUE klass, str;
675 
676  if (OBJ_FROZEN(orig)) return orig;
677  klass = rb_obj_class(orig);
678  if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
679  long ofs;
680  assert(OBJ_FROZEN(str));
681  ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
682  if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
683  (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
684  ENCODING_GET(str) != ENCODING_GET(orig)) {
685  str = str_new3(klass, str);
686  RSTRING(str)->as.heap.ptr += ofs;
687  RSTRING(str)->as.heap.len -= ofs;
688  rb_enc_cr_str_exact_copy(str, orig);
689  OBJ_INFECT(str, orig);
690  }
691  }
692  else if (STR_EMBED_P(orig)) {
693  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
694  rb_enc_cr_str_exact_copy(str, orig);
695  OBJ_INFECT(str, orig);
696  }
697  else if (STR_ASSOC_P(orig)) {
698  VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
699  FL_UNSET(orig, STR_ASSOC);
700  str = str_new4(klass, orig);
701  FL_SET(str, STR_ASSOC);
702  RSTRING(str)->as.heap.aux.shared = assoc;
703  }
704  else {
705  str = str_new4(klass, orig);
706  }
707  OBJ_FREEZE(str);
708  return str;
709 }
710 
712 #define rb_str_new4 rb_str_new_frozen
713 
714 VALUE
715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
716 {
717  return str_new(rb_obj_class(obj), ptr, len);
718 }
719 
720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
721  rb_str_new_with_class, (obj, ptr, len))
722 #define rb_str_new5 rb_str_new_with_class
723 
724 static VALUE
725 str_new_empty(VALUE str)
726 {
727  VALUE v = rb_str_new5(str, 0, 0);
728  rb_enc_copy(v, str);
729  OBJ_INFECT(v, str);
730  return v;
731 }
732 
733 #define STR_BUF_MIN_SIZE 128
734 
735 VALUE
736 rb_str_buf_new(long capa)
737 {
738  VALUE str = str_alloc(rb_cString);
739 
740  if (capa < STR_BUF_MIN_SIZE) {
741  capa = STR_BUF_MIN_SIZE;
742  }
743  FL_SET(str, STR_NOEMBED);
744  RSTRING(str)->as.heap.aux.capa = capa;
745  RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
746  RSTRING(str)->as.heap.ptr[0] = '\0';
747 
748  return str;
749 }
750 
751 VALUE
752 rb_str_buf_new_cstr(const char *ptr)
753 {
754  VALUE str;
755  long len = strlen(ptr);
756 
757  str = rb_str_buf_new(len);
758  rb_str_buf_cat(str, ptr, len);
759 
760  return str;
761 }
762 
764 #define rb_str_buf_new2 rb_str_buf_new_cstr
765 
766 VALUE
767 rb_str_tmp_new(long len)
768 {
769  return str_new(0, 0, len);
770 }
771 
772 void *
773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
774 {
775  VALUE s = rb_str_tmp_new(len);
776  *store = s;
777  return RSTRING_PTR(s);
778 }
779 
780 void
781 rb_free_tmp_buffer(volatile VALUE *store)
782 {
783  VALUE s = *store;
784  *store = 0;
785  if (s) rb_str_clear(s);
786 }
787 
788 void
790 {
791  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
792  xfree(RSTRING(str)->as.heap.ptr);
793  }
794 }
795 
796 RUBY_FUNC_EXPORTED size_t
798 {
799  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
800  return RSTRING(str)->as.heap.aux.capa;
801  }
802  else {
803  return 0;
804  }
805 }
806 
807 VALUE
809 {
810  return rb_convert_type(str, T_STRING, "String", "to_str");
811 }
812 
813 static inline void str_discard(VALUE str);
814 
815 void
817 {
818  rb_encoding *enc;
819  int cr;
820  if (str == str2) return;
821  enc = STR_ENC_GET(str2);
822  cr = ENC_CODERANGE(str2);
823  str_discard(str);
824  OBJ_INFECT(str, str2);
825  if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
826  STR_SET_EMBED(str);
827  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
828  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
829  rb_enc_associate(str, enc);
830  ENC_CODERANGE_SET(str, cr);
831  return;
832  }
833  STR_SET_NOEMBED(str);
834  STR_UNSET_NOCAPA(str);
835  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
836  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
837  if (STR_NOCAPA_P(str2)) {
838  FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
839  RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
840  }
841  else {
842  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
843  }
844  STR_SET_EMBED(str2); /* abandon str2 */
845  RSTRING_PTR(str2)[0] = 0;
846  STR_SET_EMBED_LEN(str2, 0);
847  rb_enc_associate(str, enc);
848  ENC_CODERANGE_SET(str, cr);
849 }
850 
851 static ID id_to_s;
852 
853 VALUE
855 {
856  VALUE str;
857 
858  if (TYPE(obj) == T_STRING) {
859  return obj;
860  }
861  str = rb_funcall(obj, id_to_s, 0);
862  if (TYPE(str) != T_STRING)
863  return rb_any_to_s(obj);
864  if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
865  return str;
866 }
867 
868 static VALUE
870 {
871  long len;
872 
873  len = RSTRING_LEN(str2);
874  if (STR_ASSOC_P(str2)) {
875  str2 = rb_str_new4(str2);
876  }
877  if (STR_SHARED_P(str2)) {
878  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
879  assert(OBJ_FROZEN(shared));
880  STR_SET_NOEMBED(str);
881  RSTRING(str)->as.heap.len = len;
882  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
883  FL_SET(str, ELTS_SHARED);
884  FL_UNSET(str, STR_ASSOC);
885  RSTRING(str)->as.heap.aux.shared = shared;
886  }
887  else {
888  str_replace_shared(str, str2);
889  }
890 
891  OBJ_INFECT(str, str2);
892  rb_enc_cr_str_exact_copy(str, str2);
893  return str;
894 }
895 
896 static VALUE
898 {
899  VALUE dup = str_alloc(klass);
900  str_replace(dup, str);
901  return dup;
902 }
903 
904 VALUE
906 {
907  return str_duplicate(rb_obj_class(str), str);
908 }
909 
910 VALUE
912 {
913  return str_replace(str_alloc(rb_cString), str);
914 }
915 
916 /*
917  * call-seq:
918  * String.new(str="") -> new_str
919  *
920  * Returns a new string object containing a copy of <i>str</i>.
921  */
922 
923 static VALUE
925 {
926  VALUE orig;
927 
928  if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
929  rb_str_replace(str, orig);
930  return str;
931 }
932 
933 static inline long
934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
935 {
936  long c;
937  const char *q;
938 
939  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
940  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
941  }
942  else if (rb_enc_asciicompat(enc)) {
943  c = 0;
944  if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
945  while (p < e) {
946  if (ISASCII(*p)) {
947  q = search_nonascii(p, e);
948  if (!q)
949  return c + (e - p);
950  c += q - p;
951  p = q;
952  }
953  p += rb_enc_fast_mbclen(p, e, enc);
954  c++;
955  }
956  }
957  else {
958  while (p < e) {
959  if (ISASCII(*p)) {
960  q = search_nonascii(p, e);
961  if (!q)
962  return c + (e - p);
963  c += q - p;
964  p = q;
965  }
966  p += rb_enc_mbclen(p, e, enc);
967  c++;
968  }
969  }
970  return c;
971  }
972 
973  for (c=0; p<e; c++) {
974  p += rb_enc_mbclen(p, e, enc);
975  }
976  return c;
977 }
978 
979 long
980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
981 {
982  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
983 }
984 
985 long
986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
987 {
988  long c;
989  const char *q;
990  int ret;
991 
992  *cr = 0;
993  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
994  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
995  }
996  else if (rb_enc_asciicompat(enc)) {
997  c = 0;
998  while (p < e) {
999  if (ISASCII(*p)) {
1000  q = search_nonascii(p, e);
1001  if (!q) {
1002  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1003  return c + (e - p);
1004  }
1005  c += q - p;
1006  p = q;
1007  }
1008  ret = rb_enc_precise_mbclen(p, e, enc);
1009  if (MBCLEN_CHARFOUND_P(ret)) {
1010  *cr |= ENC_CODERANGE_VALID;
1011  p += MBCLEN_CHARFOUND_LEN(ret);
1012  }
1013  else {
1014  *cr = ENC_CODERANGE_BROKEN;
1015  p++;
1016  }
1017  c++;
1018  }
1019  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1020  return c;
1021  }
1022 
1023  for (c=0; p<e; c++) {
1024  ret = rb_enc_precise_mbclen(p, e, enc);
1025  if (MBCLEN_CHARFOUND_P(ret)) {
1026  *cr |= ENC_CODERANGE_VALID;
1027  p += MBCLEN_CHARFOUND_LEN(ret);
1028  }
1029  else {
1030  *cr = ENC_CODERANGE_BROKEN;
1031  if (p + rb_enc_mbminlen(enc) <= e)
1032  p += rb_enc_mbminlen(enc);
1033  else
1034  p = e;
1035  }
1036  }
1037  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1038  return c;
1039 }
1040 
#ifdef NONASCII_MASK
/* A byte is a UTF-8 lead byte unless its top two bits are 10 (continuation). */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Count the UTF-8 lead bytes contained in the VALUE-sized word at *s.
 *
 * UTF-8 lead bytes have the bit representation 0xxxxxxx or 11xxxxxx,
 * so for a single byte the test can be written as:
 *
 *   if (!(byte & 0x80))
 *       byte |= 0x40;          // treat ASCII as if bit 6 were set
 *   return (byte >> 6) & 1;    // bit 6 now says "lead byte or not"
 *
 * This function applies that logic to every byte of the word at once and
 * then adds the per-byte results together.  The result is at most
 * sizeof(VALUE), so it fits in the low four bits.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* Move "is a lead byte" into bit 0 of every byte: (bit6 | ~bit7). */
    bits = (bits | ~(bits >> 1)) >> 6;
    bits &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags into the lowest byte. */
    bits += (bits >> 8);
    bits += (bits >> 16);
#if SIZEOF_VALUE == 8
    bits += (bits >> 32);
#endif
    return (bits & 0xF);
}
#endif
1075 
1076 static long
1078 {
1079  const char *p, *e;
1080  long n;
1081  int cr;
1082 
1083  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1084  if (!enc) enc = STR_ENC_GET(str);
1085  p = RSTRING_PTR(str);
1086  e = RSTRING_END(str);
1087  cr = ENC_CODERANGE(str);
1088 #ifdef NONASCII_MASK
1089  if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1090  enc == rb_utf8_encoding()) {
1091 
1092  VALUE len = 0;
1093  if ((int)sizeof(VALUE) * 2 < e - p) {
1094  const VALUE *s, *t;
1095  const VALUE lowbits = sizeof(VALUE) - 1;
1096  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1097  t = (const VALUE*)(~lowbits & (VALUE)e);
1098  while (p < (const char *)s) {
1099  if (is_utf8_lead_byte(*p)) len++;
1100  p++;
1101  }
1102  while (s < t) {
1103  len += count_utf8_lead_bytes_with_word(s);
1104  s++;
1105  }
1106  p = (const char *)s;
1107  }
1108  while (p < e) {
1109  if (is_utf8_lead_byte(*p)) len++;
1110  p++;
1111  }
1112  return (long)len;
1113  }
1114 #endif
1115  n = rb_enc_strlen_cr(p, e, enc, &cr);
1116  if (cr) {
1117  ENC_CODERANGE_SET(str, cr);
1118  }
1119  return n;
1120 }
1121 
1122 long
1124 {
1125  return str_strlen(str, STR_ENC_GET(str));
1126 }
1127 
1128 /*
1129  * call-seq:
1130  * str.length -> integer
1131  * str.size -> integer
1132  *
1133  * Returns the character length of <i>str</i>.
1134  */
1135 
1136 VALUE
1138 {
1139  long len;
1140 
1141  len = str_strlen(str, STR_ENC_GET(str));
1142  return LONG2NUM(len);
1143 }
1144 
1145 /*
1146  * call-seq:
1147  * str.bytesize -> integer
1148  *
1149  * Returns the length of <i>str</i> in bytes.
1150  */
1151 
1152 static VALUE
1154 {
1155  return LONG2NUM(RSTRING_LEN(str));
1156 }
1157 
1158 /*
1159  * call-seq:
1160  * str.empty? -> true or false
1161  *
1162  * Returns <code>true</code> if <i>str</i> has a length of zero.
1163  *
1164  * "hello".empty? #=> false
1165  * "".empty? #=> true
1166  */
1167 
1168 static VALUE
1170 {
1171  if (RSTRING_LEN(str) == 0)
1172  return Qtrue;
1173  return Qfalse;
1174 }
1175 
1176 /*
1177  * call-seq:
1178  * str + other_str -> new_str
1179  *
1180  * Concatenation---Returns a new <code>String</code> containing
1181  * <i>other_str</i> concatenated to <i>str</i>.
1182  *
1183  * "Hello from " + self.to_s #=> "Hello from main"
1184  */
1185 
1186 VALUE
1188 {
1189  VALUE str3;
1190  rb_encoding *enc;
1191 
1192  StringValue(str2);
1193  enc = rb_enc_check(str1, str2);
1194  str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1195  memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1196  memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1197  RSTRING_PTR(str2), RSTRING_LEN(str2));
1198  RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1199 
1200  if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1201  OBJ_TAINT(str3);
1204  return str3;
1205 }
1206 
1207 /*
1208  * call-seq:
1209  * str * integer -> new_str
1210  *
1211  * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
1212  * the receiver.
1213  *
1214  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1215  */
1216 
1217 VALUE
1219 {
1220  VALUE str2;
1221  long n, len;
1222  char *ptr2;
1223 
1224  len = NUM2LONG(times);
1225  if (len < 0) {
1226  rb_raise(rb_eArgError, "negative argument");
1227  }
1228  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1229  rb_raise(rb_eArgError, "argument too big");
1230  }
1231 
1232  str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1233  ptr2 = RSTRING_PTR(str2);
1234  if (len) {
1235  n = RSTRING_LEN(str);
1236  memcpy(ptr2, RSTRING_PTR(str), n);
1237  while (n <= len/2) {
1238  memcpy(ptr2 + n, ptr2, n);
1239  n *= 2;
1240  }
1241  memcpy(ptr2 + n, ptr2, len-n);
1242  }
1243  ptr2[RSTRING_LEN(str2)] = '\0';
1244  OBJ_INFECT(str2, str);
1245  rb_enc_cr_str_copy_for_substr(str2, str);
1246 
1247  return str2;
1248 }
1249 
1250 /*
1251  * call-seq:
1252  * str % arg -> new_str
1253  *
1254  * Format---Uses <i>str</i> as a format specification, and returns the result
1255  * of applying it to <i>arg</i>. If the format specification contains more than
1256  * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1257  * containing the values to be substituted. See <code>Kernel::sprintf</code> for
1258  * details of the format string.
1259  *
1260  * "%05d" % 123 #=> "00123"
1261  * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
1262  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
1263  */
1264 
1265 static VALUE
1267 {
1268  volatile VALUE tmp = rb_check_array_type(arg);
1269 
1270  if (!NIL_P(tmp)) {
1271  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1272  }
1273  return rb_str_format(1, &arg, str);
1274 }
1275 
1276 static inline void
1278 {
1279  if (FL_TEST(str, STR_TMPLOCK)) {
1280  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1281  }
1282  rb_check_frozen(str);
1283  if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1284  rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1285 }
1286 
1287 static inline int
1289 {
1290  str_modifiable(str);
1291  if (!STR_SHARED_P(str)) return 1;
1292  if (STR_EMBED_P(str)) return 1;
1293  return 0;
1294 }
1295 
1296 static void
1298 {
1299  char *ptr;
1300  long len = RSTRING_LEN(str);
1301  long capa = len + expand;
1302 
1303  if (len > capa) len = capa;
1304  ptr = ALLOC_N(char, capa + 1);
1305  if (RSTRING_PTR(str)) {
1306  memcpy(ptr, RSTRING_PTR(str), len);
1307  }
1308  STR_SET_NOEMBED(str);
1309  STR_UNSET_NOCAPA(str);
1310  ptr[len] = 0;
1311  RSTRING(str)->as.heap.ptr = ptr;
1312  RSTRING(str)->as.heap.len = len;
1313  RSTRING(str)->as.heap.aux.capa = capa;
1314 }
1315 
1316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1317 
1318 void
1320 {
1321  if (!str_independent(str))
1322  str_make_independent(str);
1323  ENC_CODERANGE_CLEAR(str);
1324 }
1325 
1326 void
1327 rb_str_modify_expand(VALUE str, long expand)
1328 {
1329  if (expand < 0) {
1330  rb_raise(rb_eArgError, "negative expanding string size");
1331  }
1332  if (!str_independent(str)) {
1333  str_make_independent_expand(str, expand);
1334  }
1335  else if (expand > 0) {
1336  long len = RSTRING_LEN(str);
1337  long capa = len + expand;
1338  if (!STR_EMBED_P(str)) {
1339  REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1340  RSTRING(str)->as.heap.aux.capa = capa;
1341  }
1342  else if (capa > RSTRING_EMBED_LEN_MAX) {
1343  str_make_independent_expand(str, expand);
1344  }
1345  }
1346  ENC_CODERANGE_CLEAR(str);
1347 }
1348 
1349 /* As rb_str_modify(), but don't clear coderange */
1350 static void
1352 {
1353  if (!str_independent(str))
1354  str_make_independent(str);
1355  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1356  /* Force re-scan later */
1357  ENC_CODERANGE_CLEAR(str);
1358 }
1359 
1360 static inline void
1362 {
1363  str_modifiable(str);
1364  if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1365  xfree(RSTRING_PTR(str));
1366  RSTRING(str)->as.heap.ptr = 0;
1367  RSTRING(str)->as.heap.len = 0;
1368  }
1369 }
1370 
1371 void
1373 {
1374  /* sanity check */
1375  rb_check_frozen(str);
1376  if (STR_ASSOC_P(str)) {
1377  /* already associated */
1378  rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1379  }
1380  else {
1381  if (STR_SHARED_P(str)) {
1382  VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1383  str_make_independent(str);
1384  if (STR_ASSOC_P(assoc)) {
1385  assoc = RSTRING(assoc)->as.heap.aux.shared;
1386  rb_ary_concat(assoc, add);
1387  add = assoc;
1388  }
1389  }
1390  else if (STR_EMBED_P(str)) {
1391  str_make_independent(str);
1392  }
1393  else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1394  RESIZE_CAPA(str, RSTRING_LEN(str));
1395  }
1396  FL_SET(str, STR_ASSOC);
1397  RBASIC(add)->klass = 0;
1398  RSTRING(str)->as.heap.aux.shared = add;
1399  }
1400 }
1401 
1402 VALUE
1404 {
1405  if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1406  if (STR_ASSOC_P(str)) {
1407  return RSTRING(str)->as.heap.aux.shared;
1408  }
1409  return Qfalse;
1410 }
1411 
1412 VALUE
1413 rb_string_value(volatile VALUE *ptr)
1414 {
1415  VALUE s = *ptr;
1416  if (TYPE(s) != T_STRING) {
1417  s = rb_str_to_str(s);
1418  *ptr = s;
1419  }
1420  return s;
1421 }
1422 
1423 char *
1425 {
1426  VALUE str = rb_string_value(ptr);
1427  return RSTRING_PTR(str);
1428 }
1429 
1430 char *
1432 {
1433  VALUE str = rb_string_value(ptr);
1434  char *s = RSTRING_PTR(str);
1435  long len = RSTRING_LEN(str);
1436 
1437  if (!s || memchr(s, 0, len)) {
1438  rb_raise(rb_eArgError, "string contains null byte");
1439  }
1440  if (s[len]) {
1441  rb_str_modify(str);
1442  s = RSTRING_PTR(str);
1443  s[RSTRING_LEN(str)] = 0;
1444  }
1445  return s;
1446 }
1447 
1448 VALUE
1450 {
1451  str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1452  return str;
1453 }
1454 
1455 /*
1456  * call-seq:
1457  * String.try_convert(obj) -> string or nil
1458  *
1459  * Try to convert <i>obj</i> into a String, using to_str method.
1460  * Returns converted string or nil if <i>obj</i> cannot be converted
1461  * for any reason.
1462  *
1463  * String.try_convert("str") #=> "str"
1464  * String.try_convert(/re/) #=> nil
1465  */
1466 static VALUE
1468 {
1469  return rb_check_string_type(str);
1470 }
1471 
1472 static char*
1473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1474 {
1475  long nth = *nthp;
1476  if (rb_enc_mbmaxlen(enc) == 1) {
1477  p += nth;
1478  }
1479  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1480  p += nth * rb_enc_mbmaxlen(enc);
1481  }
1482  else if (rb_enc_asciicompat(enc)) {
1483  const char *p2, *e2;
1484  int n;
1485 
1486  while (p < e && 0 < nth) {
1487  e2 = p + nth;
1488  if (e < e2) {
1489  *nthp = nth;
1490  return (char *)e;
1491  }
1492  if (ISASCII(*p)) {
1493  p2 = search_nonascii(p, e2);
1494  if (!p2) {
1495  *nthp = nth;
1496  return (char *)e2;
1497  }
1498  nth -= p2 - p;
1499  p = p2;
1500  }
1501  n = rb_enc_mbclen(p, e, enc);
1502  p += n;
1503  nth--;
1504  }
1505  *nthp = nth;
1506  if (nth != 0) {
1507  return (char *)e;
1508  }
1509  return (char *)p;
1510  }
1511  else {
1512  while (p < e && nth--) {
1513  p += rb_enc_mbclen(p, e, enc);
1514  }
1515  }
1516  if (p > e) p = e;
1517  *nthp = nth;
1518  return (char*)p;
1519 }
1520 
1521 char*
1522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1523 {
1524  return str_nth_len(p, e, &nth, enc);
1525 }
1526 
1527 static char*
1528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1529 {
1530  if (singlebyte)
1531  p += nth;
1532  else {
1533  p = str_nth_len(p, e, &nth, enc);
1534  }
1535  if (!p) return 0;
1536  if (p > e) p = e;
1537  return (char *)p;
1538 }
1539 
1540 /* char offset to byte offset */
1541 static long
1542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1543 {
1544  const char *pp = str_nth(p, e, nth, enc, singlebyte);
1545  if (!pp) return e - p;
1546  return pp - p;
1547 }
1548 
1549 long
1550 rb_str_offset(VALUE str, long pos)
1551 {
1552  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1554 }
1555 
1556 #ifdef NONASCII_MASK
/*
 * Scans [p, e) counting UTF-8 lead bytes and returns the position of the
 * lead byte reached once *nthp lead bytes have been passed, or e when the
 * buffer ends first.  The count still outstanding is stored back in *nthp.
 *
 * For long inputs and large counts the scan first consumes the unaligned
 * head byte by byte, then whole VALUE-sized words via
 * count_utf8_lead_bytes_with_word(), and finishes byte by byte.
 */
1557 static char *
1558 str_utf8_nth(const char *p, const char *e, long *nthp)
1559 {
1560  long nth = *nthp;
 /* word-at-a-time path only when both the buffer and the count are large */
1561  if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1562  const VALUE *s, *t;
1563  const VALUE lowbits = sizeof(VALUE) - 1;
 /* s: p rounded up to a VALUE boundary; t: e rounded down to one */
1564  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1565  t = (const VALUE*)(~lowbits & (VALUE)e);
 /* count the bytes in front of the first aligned word one at a time */
1566  while (p < (const char *)s) {
1567  if (is_utf8_lead_byte(*p)) nth--;
1568  p++;
1569  }
 /* keep taking whole words while at least sizeof(VALUE) lead bytes are
  * still wanted -- presumably so one word can never overshoot the count */
1570  do {
1571  nth -= count_utf8_lead_bytes_with_word(s);
1572  s++;
1573  } while (s < t && (int)sizeof(VALUE) <= nth);
1574  p = (char *)s;
1575  }
 /* byte-wise tail: stop on the lead byte found once the count hits zero */
1576  while (p < e) {
1577  if (is_utf8_lead_byte(*p)) {
1578  if (nth == 0) break;
1579  nth--;
1580  }
1581  p++;
1582  }
1583  *nthp = nth;
1584  return (char *)p;
1585 }
1586 
1587 static long
1588 str_utf8_offset(const char *p, const char *e, long nth)
1589 {
1590  const char *pp = str_utf8_nth(p, e, &nth);
1591  return pp - p;
1592 }
1593 #endif
1594 
1595 /* byte offset to char offset */
1596 long
1597 rb_str_sublen(VALUE str, long pos)
1598 {
1599  if (single_byte_optimizable(str) || pos < 0)
1600  return pos;
1601  else {
1602  char *p = RSTRING_PTR(str);
1603  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1604  }
1605 }
1606 
1607 VALUE
1608 rb_str_subseq(VALUE str, long beg, long len)
1609 {
1610  VALUE str2;
1611 
1612  if (RSTRING_LEN(str) == beg + len &&
1613  RSTRING_EMBED_LEN_MAX < len) {
1614  str2 = rb_str_new_shared(rb_str_new_frozen(str));
1615  rb_str_drop_bytes(str2, beg);
1616  }
1617  else {
1618  str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1619  }
1620 
1621  rb_enc_cr_str_copy_for_substr(str2, str);
1622  OBJ_INFECT(str2, str);
1623 
1624  return str2;
1625 }
1626 
1627 VALUE
1628 rb_str_substr(VALUE str, long beg, long len)
1629 {
1630  rb_encoding *enc = STR_ENC_GET(str);
1631  VALUE str2;
1632  char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1633 
1634  if (len < 0) return Qnil;
1635  if (!RSTRING_LEN(str)) {
1636  len = 0;
1637  }
1638  if (single_byte_optimizable(str)) {
1639  if (beg > RSTRING_LEN(str)) return Qnil;
1640  if (beg < 0) {
1641  beg += RSTRING_LEN(str);
1642  if (beg < 0) return Qnil;
1643  }
1644  if (beg + len > RSTRING_LEN(str))
1645  len = RSTRING_LEN(str) - beg;
1646  if (len <= 0) {
1647  len = 0;
1648  p = 0;
1649  }
1650  else
1651  p = s + beg;
1652  goto sub;
1653  }
1654  if (beg < 0) {
1655  if (len > -beg) len = -beg;
1656  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1657  beg = -beg;
1658  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1659  p = e;
1660  if (!p) return Qnil;
1661  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1662  if (!p) return Qnil;
1663  len = e - p;
1664  goto sub;
1665  }
1666  else {
1667  beg += str_strlen(str, enc);
1668  if (beg < 0) return Qnil;
1669  }
1670  }
1671  else if (beg > 0 && beg > RSTRING_LEN(str)) {
1672  return Qnil;
1673  }
1674  if (len == 0) {
1675  if (beg > str_strlen(str, enc)) return Qnil;
1676  p = 0;
1677  }
1678 #ifdef NONASCII_MASK
1679  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1680  enc == rb_utf8_encoding()) {
1681  p = str_utf8_nth(s, e, &beg);
1682  if (beg > 0) return Qnil;
1683  len = str_utf8_offset(p, e, len);
1684  }
1685 #endif
1686  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1687  int char_sz = rb_enc_mbmaxlen(enc);
1688 
1689  p = s + beg * char_sz;
1690  if (p > e) {
1691  return Qnil;
1692  }
1693  else if (len * char_sz > e - p)
1694  len = e - p;
1695  else
1696  len *= char_sz;
1697  }
1698  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1699  if (beg > 0) return Qnil;
1700  len = 0;
1701  }
1702  else {
1703  len = str_offset(p, e, len, enc, 0);
1704  }
1705  sub:
1706  if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1707  str2 = rb_str_new4(str);
1708  str2 = str_new3(rb_obj_class(str2), str2);
1709  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1710  RSTRING(str2)->as.heap.len = len;
1711  }
1712  else {
1713  str2 = rb_str_new5(str, p, len);
1714  rb_enc_cr_str_copy_for_substr(str2, str);
1715  OBJ_INFECT(str2, str);
1716  }
1717 
1718  return str2;
1719 }
1720 
1721 VALUE
1723 {
1724  if (STR_ASSOC_P(str)) {
1725  VALUE ary = RSTRING(str)->as.heap.aux.shared;
1726  OBJ_FREEZE(ary);
1727  }
1728  return rb_obj_freeze(str);
1729 }
1730 
1732 #define rb_str_dup_frozen rb_str_new_frozen
1733 
1734 VALUE
1735 rb_str_locktmp(VALUE str)
1736 {
1737  if (FL_TEST(str, STR_TMPLOCK)) {
1738  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1739  }
1740  FL_SET(str, STR_TMPLOCK);
1741  return str;
1742 }
1743 
1744 VALUE
1746 {
1747  if (!FL_TEST(str, STR_TMPLOCK)) {
1748  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1749  }
1750  FL_UNSET(str, STR_TMPLOCK);
1751  return str;
1752 }
1753 
1754 VALUE
1756 {
1757  rb_str_locktmp(str);
1758  return rb_ensure(func, arg, rb_str_unlocktmp, str);
1759 }
1760 
1761 void
1762 rb_str_set_len(VALUE str, long len)
1763 {
1764  long capa;
1765 
1766  str_modifiable(str);
1767  if (STR_SHARED_P(str)) {
1768  rb_raise(rb_eRuntimeError, "can't set length of shared string");
1769  }
1770  if (len > (capa = (long)rb_str_capacity(str))) {
1771  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1772  }
1773  STR_SET_LEN(str, len);
1774  RSTRING_PTR(str)[len] = '\0';
1775 }
1776 
/*
 * Changes the byte length of str to len and returns str.
 * Raises ArgumentError when len is negative.  The coderange of str is
 * cleared even when the length does not change.
 */
1777 VALUE
1778 rb_str_resize(VALUE str, long len)
1779 {
1780  long slen;
1781  int independent;
1782 
1783  if (len < 0) {
1784  rb_raise(rb_eArgError, "negative string size (or size too big)");
1785  }
1786 
 /* remembered up front: later branches decide on it whether the old
  * buffer may be freed or reallocated in place */
1787  independent = str_independent(str);
1788  ENC_CODERANGE_CLEAR(str);
1789  slen = RSTRING_LEN(str);
1790  if (len != slen) {
1791  if (STR_EMBED_P(str)) {
 /* embedded and still fits: only length and terminator change */
1792  if (len <= RSTRING_EMBED_LEN_MAX) {
1793  STR_SET_EMBED_LEN(str, len);
1794  RSTRING(str)->as.ary[len] = '\0';
1795  return str;
1796  }
 /* embedded but too small: move to a separate buffer with room for the
  * extra bytes and mark the string as no longer embedded */
1797  str_make_independent_expand(str, len - slen);
1798  STR_SET_NOEMBED(str);
1799  }
 /* heap string that now fits the embedded array: copy min(slen, len)
  * bytes in; the old buffer is freed only if this string was independent */
1800  else if (len <= RSTRING_EMBED_LEN_MAX) {
1801  char *ptr = RSTRING(str)->as.heap.ptr;
1802  STR_SET_EMBED(str);
1803  if (slen > len) slen = len;
1804  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1805  RSTRING(str)->as.ary[len] = '\0';
1806  STR_SET_EMBED_LEN(str, len);
1807  if (independent) xfree(ptr);
1808  return str;
1809  }
 /* not independent: get a private buffer sized for the new length */
1810  else if (!independent) {
1811  str_make_independent_expand(str, len - slen);
1812  }
 /* independent heap buffer: reallocate when growing, or when shrinking
  * by more than 1024 bytes */
1813  else if (slen < len || slen - len > 1024) {
1814  REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1815  }
 /* skip the capacity field when the shared/assoc flags are set */
1816  if (!STR_NOCAPA_P(str)) {
1817  RSTRING(str)->as.heap.aux.capa = len;
1818  }
1819  RSTRING(str)->as.heap.len = len;
1820  RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
1821  }
1822  return str;
1823 }
1824 
1825 static VALUE
1826 str_buf_cat(VALUE str, const char *ptr, long len)
1827 {
1828  long capa, total, off = -1;
1829 
1830  if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1831  off = ptr - RSTRING_PTR(str);
1832  }
1833  rb_str_modify(str);
1834  if (len == 0) return 0;
1835  if (STR_ASSOC_P(str)) {
1836  FL_UNSET(str, STR_ASSOC);
1837  capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1838  }
1839  else if (STR_EMBED_P(str)) {
1840  capa = RSTRING_EMBED_LEN_MAX;
1841  }
1842  else {
1843  capa = RSTRING(str)->as.heap.aux.capa;
1844  }
1845  if (RSTRING_LEN(str) >= LONG_MAX - len) {
1846  rb_raise(rb_eArgError, "string sizes too big");
1847  }
1848  total = RSTRING_LEN(str)+len;
1849  if (capa <= total) {
1850  while (total > capa) {
1851  if (capa + 1 >= LONG_MAX / 2) {
1852  capa = (total + 4095) / 4096;
1853  break;
1854  }
1855  capa = (capa + 1) * 2;
1856  }
1857  RESIZE_CAPA(str, capa);
1858  }
1859  if (off != -1) {
1860  ptr = RSTRING_PTR(str) + off;
1861  }
1862  memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1863  STR_SET_LEN(str, total);
1864  RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1865 
1866  return str;
1867 }
1868 
1869 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1870 
1871 VALUE
1872 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1873 {
1874  if (len == 0) return str;
1875  if (len < 0) {
1876  rb_raise(rb_eArgError, "negative string size (or size too big)");
1877  }
1878  return str_buf_cat(str, ptr, len);
1879 }
1880 
1881 VALUE
1882 rb_str_buf_cat2(VALUE str, const char *ptr)
1883 {
1884  return rb_str_buf_cat(str, ptr, strlen(ptr));
1885 }
1886 
1887 VALUE
1888 rb_str_cat(VALUE str, const char *ptr, long len)
1889 {
1890  if (len < 0) {
1891  rb_raise(rb_eArgError, "negative string size (or size too big)");
1892  }
1893  if (STR_ASSOC_P(str)) {
1894  char *p;
1895  rb_str_modify_expand(str, len);
1896  p = RSTRING(str)->as.heap.ptr;
1897  memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1898  len = RSTRING(str)->as.heap.len += len;
1899  p[len] = '\0'; /* sentinel */
1900  return str;
1901  }
1902 
1903  return rb_str_buf_cat(str, ptr, len);
1904 }
1905 
1906 VALUE
1907 rb_str_cat2(VALUE str, const char *ptr)
1908 {
1909  return rb_str_cat(str, ptr, strlen(ptr));
1910 }
1911 
/*
 * Appends len bytes at ptr to str, where the bytes are in the encoding with
 * index ptr_encindex and have coderange ptr_cr, then sets the encoding
 * index and coderange of the result on str.  Returns str.
 *
 * When ptr_cr_ret is non-null it receives the coderange used for ptr, which
 * may have been obtained by scanning when ptr_cr was passed as unknown.
 *
 * Raises rb_eEncCompatError when the two encodings differ and cannot be
 * combined, and ArgumentError when len is negative.
 * NOTE(review): the negative-len check runs only after the coderange scans
 * and compatibility checks above it -- confirm that ordering is intended.
 */
1912 static VALUE
1913 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1914  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1915 {
1916  int str_encindex = ENCODING_GET(str);
1917  int res_encindex;
1918  int str_cr, res_cr;
1919 
1920  str_cr = ENC_CODERANGE(str);
1921 
1922  if (str_encindex == ptr_encindex) {
 /* same encoding: with an unknown receiver the result is unknown anyway,
  * so ptr is not scanned; otherwise scan ptr if its coderange is unknown */
1923  if (str_cr == ENC_CODERANGE_UNKNOWN)
1924  ptr_cr = ENC_CODERANGE_UNKNOWN;
1925  else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1926  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1927  }
1928  }
1929  else {
1930  rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1931  rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
 /* different encodings and at least one is not ASCII-compatible: only an
  * empty side can be tolerated */
1932  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
1933  if (len == 0)
1934  return str;
 /* empty receiver: it takes over ptr's encoding and coderange */
1935  if (RSTRING_LEN(str) == 0) {
1936  rb_str_buf_cat(str, ptr, len);
1937  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1938  return str;
1939  }
1940  goto incompatible;
1941  }
1942  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1943  ptr_cr = coderange_scan(ptr, len, ptr_enc);
1944  }
 /* the receiver's coderange is resolved only when it can affect the
  * outcome (receiver is ASCII-8BIT, or ptr is not 7bit) */
1945  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1946  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
1947  str_cr = rb_enc_str_coderange(str);
1948  }
1949  }
1950  }
1951  if (ptr_cr_ret)
1952  *ptr_cr_ret = ptr_cr;
1953 
 /* differing encodings combine only when at least one side is 7bit */
1954  if (str_encindex != ptr_encindex &&
1955  str_cr != ENC_CODERANGE_7BIT &&
1956  ptr_cr != ENC_CODERANGE_7BIT) {
 /* also the jump target of the goto above, inside this if body */
1957  incompatible:
1958  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1959  rb_enc_name(rb_enc_from_index(str_encindex)),
1960  rb_enc_name(rb_enc_from_index(ptr_encindex)));
1961  }
1962 
 /* choose the encoding index and coderange of the concatenation */
1963  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1964  res_encindex = str_encindex;
1965  res_cr = ENC_CODERANGE_UNKNOWN;
1966  }
1967  else if (str_cr == ENC_CODERANGE_7BIT) {
 /* a 7bit receiver adopts ptr's encoding and coderange unless ptr is
  * 7bit as well */
1968  if (ptr_cr == ENC_CODERANGE_7BIT) {
1969  res_encindex = str_encindex;
1970  res_cr = ENC_CODERANGE_7BIT;
1971  }
1972  else {
1973  res_encindex = ptr_encindex;
1974  res_cr = ptr_cr;
1975  }
1976  }
1977  else if (str_cr == ENC_CODERANGE_VALID) {
1978  res_encindex = str_encindex;
1979  if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
1980  res_cr = str_cr;
1981  else
1982  res_cr = ptr_cr;
1983  }
1984  else { /* str_cr == ENC_CODERANGE_BROKEN */
1985  res_encindex = str_encindex;
1986  res_cr = str_cr;
 /* appending any bytes to a broken string resets it to unknown */
1987  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1988  }
1989 
1990  if (len < 0) {
1991  rb_raise(rb_eArgError, "negative string size (or size too big)");
1992  }
1993  str_buf_cat(str, ptr, len);
1994  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1995  return str;
1996 }
1997 
1998 VALUE
1999 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2000 {
2001  return rb_enc_cr_str_buf_cat(str, ptr, len,
2003 }
2004 
2005 VALUE
2006 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2007 {
2008  /* ptr must reference NUL terminated ASCII string. */
2009  int encindex = ENCODING_GET(str);
2010  rb_encoding *enc = rb_enc_from_index(encindex);
2011  if (rb_enc_asciicompat(enc)) {
2012  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2013  encindex, ENC_CODERANGE_7BIT, 0);
2014  }
2015  else {
2016  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2017  while (*ptr) {
2018  unsigned int c = (unsigned char)*ptr;
2019  int len = rb_enc_codelen(c, enc);
2020  rb_enc_mbcput(c, buf, enc);
2021  rb_enc_cr_str_buf_cat(str, buf, len,
2022  encindex, ENC_CODERANGE_VALID, 0);
2023  ptr++;
2024  }
2025  return str;
2026  }
2027 }
2028 
2029 VALUE
2031 {
2032  int str2_cr;
2033 
2034  str2_cr = ENC_CODERANGE(str2);
2035 
2036  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2037  ENCODING_GET(str2), str2_cr, &str2_cr);
2038 
2039  OBJ_INFECT(str, str2);
2040  ENC_CODERANGE_SET(str2, str2_cr);
2041 
2042  return str;
2043 }
2044 
2045 VALUE
2047 {
2048  rb_encoding *enc;
2049  int cr, cr2;
2050  long len2;
2051 
2052  StringValue(str2);
2053  if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2054  long len = RSTRING_LEN(str) + len2;
2055  enc = rb_enc_check(str, str2);
2056  cr = ENC_CODERANGE(str);
2057  if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2058  rb_str_modify_expand(str, len2);
2059  memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2060  RSTRING_PTR(str2), len2+1);
2061  RSTRING(str)->as.heap.len = len;
2062  rb_enc_associate(str, enc);
2063  ENC_CODERANGE_SET(str, cr);
2064  OBJ_INFECT(str, str2);
2065  return str;
2066  }
2067  return rb_str_buf_append(str, str2);
2068 }
2069 
2070 /*
2071  * call-seq:
2072  * str << integer -> str
2073  * str.concat(integer) -> str
2074  * str << obj -> str
2075  * str.concat(obj) -> str
2076  *
2077  * Append---Concatenates the given object to <i>str</i>. If the object is a
2078  * <code>Integer</code>, it is considered as a codepoint, and is converted
2079  * to a character before concatenation.
2080  *
2081  * a = "hello "
2082  * a << "world" #=> "hello world"
2083  * a.concat(33) #=> "hello world!"
2084  */
2085 
2086 VALUE
2088 {
2089  unsigned int code;
2090  rb_encoding *enc = STR_ENC_GET(str1);
2091 
2092  if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
2093  if (rb_num_to_uint(str2, &code) == 0) {
2094  }
2095  else if (FIXNUM_P(str2)) {
2096  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2097  }
2098  else {
2099  rb_raise(rb_eRangeError, "bignum out of char range");
2100  }
2101  }
2102  else {
2103  return rb_str_append(str1, str2);
2104  }
2105 
2106  if (enc == rb_usascii_encoding()) {
2107  /* US-ASCII automatically extended to ASCII-8BIT */
2108  char buf[1];
2109  buf[0] = (char)code;
2110  if (code > 0xFF) {
2111  rb_raise(rb_eRangeError, "%u out of char range", code);
2112  }
2113  rb_str_cat(str1, buf, 1);
2114  if (code > 127) {
2117  }
2118  }
2119  else {
2120  long pos = RSTRING_LEN(str1);
2121  int cr = ENC_CODERANGE(str1);
2122  int len;
2123  char *buf;
2124 
2125  switch (len = rb_enc_codelen(code, enc)) {
2127  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2128  break;
2130  case 0:
2131  rb_raise(rb_eRangeError, "%u out of char range", code);
2132  break;
2133  }
2134  buf = ALLOCA_N(char, len + 1);
2135  rb_enc_mbcput(code, buf, enc);
2136  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2137  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2138  }
2139  rb_str_resize(str1, pos+len);
2140  strncpy(RSTRING_PTR(str1) + pos, buf, len);
2141  if (cr == ENC_CODERANGE_7BIT && code > 127)
2142  cr = ENC_CODERANGE_VALID;
2143  ENC_CODERANGE_SET(str1, cr);
2144  }
2145  return str1;
2146 }
2147 
2148 /*
2149  * call-seq:
2150  * str.prepend(other_str) -> str
2151  *
2152  * Prepend---Prepend the given string to <i>str</i>.
2153  *
2154  * a = "world"
2155  * a.prepend("hello ") #=> "hello world"
2156  * a #=> "hello world"
2157  */
2158 
2159 static VALUE
2161 {
2162  StringValue(str2);
2163  StringValue(str);
2164  rb_str_update(str, 0L, 0L, str2);
2165  return str;
2166 }
2167 
2168 st_index_t
2170 {
2171  int e = ENCODING_GET(str);
2172  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2173  e = 0;
2174  }
2175  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2176 }
2177 
2178 int
2180 {
2181  long len;
2182 
2183  if (!rb_str_comparable(str1, str2)) return 1;
2184  if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2185  memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2186  return 0;
2187  }
2188  return 1;
2189 }
2190 
2191 /*
2192  * call-seq:
2193  * str.hash -> fixnum
2194  *
2195  * Return a hash based on the string's length and content.
2196  */
2197 
2198 static VALUE
2200 {
2201  st_index_t hval = rb_str_hash(str);
2202  return INT2FIX(hval);
2203 }
2204 
2205 #define lesser(a,b) (((a)>(b))?(b):(a))
2206 
2207 int
2209 {
2210  int idx1, idx2;
2211  int rc1, rc2;
2212 
2213  if (RSTRING_LEN(str1) == 0) return TRUE;
2214  if (RSTRING_LEN(str2) == 0) return TRUE;
2215  idx1 = ENCODING_GET(str1);
2216  idx2 = ENCODING_GET(str2);
2217  if (idx1 == idx2) return TRUE;
2218  rc1 = rb_enc_str_coderange(str1);
2219  rc2 = rb_enc_str_coderange(str2);
2220  if (rc1 == ENC_CODERANGE_7BIT) {
2221  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2223  return TRUE;
2224  }
2225  if (rc2 == ENC_CODERANGE_7BIT) {
2227  return TRUE;
2228  }
2229  return FALSE;
2230 }
2231 
2232 int
2234 {
2235  long len1, len2;
2236  const char *ptr1, *ptr2;
2237  int retval;
2238 
2239  if (str1 == str2) return 0;
2240  RSTRING_GETMEM(str1, ptr1, len1);
2241  RSTRING_GETMEM(str2, ptr2, len2);
2242  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2243  if (len1 == len2) {
2244  if (!rb_str_comparable(str1, str2)) {
2245  if (ENCODING_GET(str1) > ENCODING_GET(str2))
2246  return 1;
2247  return -1;
2248  }
2249  return 0;
2250  }
2251  if (len1 > len2) return 1;
2252  return -1;
2253  }
2254  if (retval > 0) return 1;
2255  return -1;
2256 }
2257 
2258 /* expect tail call optimization */
2259 static VALUE
2260 str_eql(const VALUE str1, const VALUE str2)
2261 {
2262  const long len = RSTRING_LEN(str1);
2263  const char *ptr1, *ptr2;
2264 
2265  if (len != RSTRING_LEN(str2)) return Qfalse;
2266  if (!rb_str_comparable(str1, str2)) return Qfalse;
2267  if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2268  return Qtrue;
2269  if (memcmp(ptr1, ptr2, len) == 0)
2270  return Qtrue;
2271  return Qfalse;
2272 }
2273 /*
2274  * call-seq:
2275  * str == obj -> true or false
2276  *
2277  * Equality---If <i>obj</i> is not a <code>String</code>, returns
2278  * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2279  * <code><=></code> <i>obj</i> returns zero.
2280  */
2281 
2282 VALUE
2284 {
2285  if (str1 == str2) return Qtrue;
2286  if (TYPE(str2) != T_STRING) {
2287  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2288  return Qfalse;
2289  }
2290  return rb_equal(str2, str1);
2291  }
2292  return str_eql(str1, str2);
2293 }
2294 
2295 /*
2296  * call-seq:
2297  * str.eql?(other) -> true or false
2298  *
2299  * Two strings are equal if they have the same length and content.
2300  */
2301 
2302 static VALUE
2304 {
2305  if (str1 == str2) return Qtrue;
2306  if (TYPE(str2) != T_STRING) return Qfalse;
2307  return str_eql(str1, str2);
2308 }
2309 
2310 /*
2311  * call-seq:
2312  * str <=> other_str -> -1, 0, +1 or nil
2313  *
2314  * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
2315  * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
2316  * <i>str</i>. If the strings are of different lengths, and the strings are
2317  * equal when compared up to the shortest length, then the longer string is
2318  * considered greater than the shorter one. In older versions of Ruby, setting
2319  * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
2320  * in favor of using <code>String#casecmp</code>.
2321  *
2322  * <code><=></code> is the basis for the methods <code><</code>,
2323  * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
2324  * included from module <code>Comparable</code>. The method
2325  * <code>String#==</code> does not use <code>Comparable#==</code>.
2326  *
2327  * "abcdef" <=> "abcde" #=> 1
2328  * "abcdef" <=> "abcdef" #=> 0
2329  * "abcdef" <=> "abcdefg" #=> -1
2330  * "abcdef" <=> "ABCDEF" #=> 1
2331  */
2332 
2333 static VALUE
2335 {
2336  long result;
2337 
2338  if (TYPE(str2) != T_STRING) {
2339  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2340  return Qnil;
2341  }
2342  else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2343  return Qnil;
2344  }
2345  else {
2346  VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2347 
2348  if (NIL_P(tmp)) return Qnil;
2349  if (!FIXNUM_P(tmp)) {
2350  return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2351  }
2352  result = -FIX2LONG(tmp);
2353  }
2354  }
2355  else {
2356  result = rb_str_cmp(str1, str2);
2357  }
2358  return LONG2NUM(result);
2359 }
2360 
2361 /*
2362  * call-seq:
2363  * str.casecmp(other_str) -> -1, 0, +1 or nil
2364  *
2365  * Case-insensitive version of <code>String#<=></code>.
2366  *
2367  * "abcdef".casecmp("abcde") #=> 1
2368  * "aBcDeF".casecmp("abcdef") #=> 0
2369  * "abcdef".casecmp("abcdefg") #=> -1
2370  * "abcdef".casecmp("ABCDEF") #=> 0
2371  */
2372 
2373 static VALUE
2375 {
2376  long len;
2377  rb_encoding *enc;
2378  char *p1, *p1end, *p2, *p2end;
2379 
2380  StringValue(str2);
2381  enc = rb_enc_compatible(str1, str2);
2382  if (!enc) {
2383  return Qnil;
2384  }
2385 
2386  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2387  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2388  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2389  while (p1 < p1end && p2 < p2end) {
2390  if (*p1 != *p2) {
2391  unsigned int c1 = TOUPPER(*p1 & 0xff);
2392  unsigned int c2 = TOUPPER(*p2 & 0xff);
2393  if (c1 != c2)
2394  return INT2FIX(c1 < c2 ? -1 : 1);
2395  }
2396  p1++;
2397  p2++;
2398  }
2399  }
2400  else {
2401  while (p1 < p1end && p2 < p2end) {
2402  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2403  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2404 
2405  if (0 <= c1 && 0 <= c2) {
2406  c1 = TOUPPER(c1);
2407  c2 = TOUPPER(c2);
2408  if (c1 != c2)
2409  return INT2FIX(c1 < c2 ? -1 : 1);
2410  }
2411  else {
2412  int r;
2413  l1 = rb_enc_mbclen(p1, p1end, enc);
2414  l2 = rb_enc_mbclen(p2, p2end, enc);
2415  len = l1 < l2 ? l1 : l2;
2416  r = memcmp(p1, p2, len);
2417  if (r != 0)
2418  return INT2FIX(r < 0 ? -1 : 1);
2419  if (l1 != l2)
2420  return INT2FIX(l1 < l2 ? -1 : 1);
2421  }
2422  p1 += l1;
2423  p2 += l2;
2424  }
2425  }
2426  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2427  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2428  return INT2FIX(-1);
2429 }
2430 
/*
 * Searches str for sub, starting at character index offset (a negative
 * offset counts from the end).  Returns the byte offset of the match within
 * str, or -1 when there is none.  rb_enc_check() is applied to the pair
 * first, and a broken sub never matches.
 */
2431 static long
2432 rb_str_index(VALUE str, VALUE sub, long offset)
2433 {
2434  long pos;
2435  char *s, *sptr, *e;
2436  long len, slen;
2437  rb_encoding *enc;
2438 
2439  enc = rb_enc_check(str, sub);
2440  if (is_broken_string(sub)) {
2441  return -1;
2442  }
 /* lengths in characters, for the range checks */
2443  len = str_strlen(str, enc);
2444  slen = str_strlen(sub, enc);
2445  if (offset < 0) {
2446  offset += len;
2447  if (offset < 0) return -1;
2448  }
2449  if (len - offset < slen) return -1;
2450  s = RSTRING_PTR(str);
2451  e = s + RSTRING_LEN(str);
 /* from here on offset is a byte offset */
2452  if (offset) {
2453  offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2454  s += offset;
2455  }
2456  if (slen == 0) return offset;
2457  /* need proceed one character at a time */
 /* switch to byte lengths for the raw memory search */
2458  sptr = RSTRING_PTR(sub);
2459  slen = RSTRING_LEN(sub);
2460  len = RSTRING_LEN(str) - offset;
2461  for (;;) {
2462  char *t;
2463  pos = rb_memsearch(sptr, slen, s, len, enc);
2464  if (pos < 0) return pos;
 /* accept the hit only if it starts on a character boundary; otherwise
  * resume the search from the character head t */
2465  t = rb_enc_right_char_head(s, s+pos, e, enc);
2466  if (t == s + pos) break;
2467  if ((len -= t - s) <= 0) return -1;
2468  offset += t - s;
2469  s = t;
2470  }
2471  return pos + offset;
2472 }
2473 
2474 
2475 /*
2476  * call-seq:
2477  * str.index(substring [, offset]) -> fixnum or nil
2478  * str.index(regexp [, offset]) -> fixnum or nil
2479  *
2480  * Returns the index of the first occurrence of the given <i>substring</i> or
2481  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2482  * found. If the second parameter is present, it specifies the position in the
2483  * string to begin the search.
2484  *
2485  * "hello".index('e') #=> 1
2486  * "hello".index('lo') #=> 3
2487  * "hello".index('a') #=> nil
2488  * "hello".index(?e) #=> 1
2489  * "hello".index(/[aeiou]/, -3) #=> 4
2490  */
2491 
2492 static VALUE
2494 {
2495  VALUE sub;
2496  VALUE initpos;
2497  long pos;
2498 
2499  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2500  pos = NUM2LONG(initpos);
2501  }
2502  else {
2503  pos = 0;
2504  }
2505  if (pos < 0) {
2506  pos += str_strlen(str, STR_ENC_GET(str));
2507  if (pos < 0) {
2508  if (TYPE(sub) == T_REGEXP) {
2510  }
2511  return Qnil;
2512  }
2513  }
2514 
2515  switch (TYPE(sub)) {
2516  case T_REGEXP:
2517  if (pos > str_strlen(str, STR_ENC_GET(str)))
2518  return Qnil;
2519  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2520  rb_enc_check(str, sub), single_byte_optimizable(str));
2521 
2522  pos = rb_reg_search(sub, str, pos, 0);
2523  pos = rb_str_sublen(str, pos);
2524  break;
2525 
2526  default: {
2527  VALUE tmp;
2528 
2529  tmp = rb_check_string_type(sub);
2530  if (NIL_P(tmp)) {
2531  rb_raise(rb_eTypeError, "type mismatch: %s given",
2532  rb_obj_classname(sub));
2533  }
2534  sub = tmp;
2535  }
2536  /* fall through */
2537  case T_STRING:
2538  pos = rb_str_index(str, sub, pos);
2539  pos = rb_str_sublen(str, pos);
2540  break;
2541  }
2542 
2543  if (pos == -1) return Qnil;
2544  return LONG2NUM(pos);
2545 }
2546 
/*
 * Searches str backwards for sub, starting at character index pos.
 * Returns the character index of the match, or -1 when there is none.
 * rb_enc_check() is applied to the pair first, and a broken sub never
 * matches.
 */
2547 static long
2548 rb_str_rindex(VALUE str, VALUE sub, long pos)
2549 {
2550  long len, slen;
2551  char *s, *sbeg, *e, *t;
2552  rb_encoding *enc;
2553  int singlebyte = single_byte_optimizable(str);
2554 
2555  enc = rb_enc_check(str, sub);
2556  if (is_broken_string(sub)) {
2557  return -1;
2558  }
 /* lengths in characters */
2559  len = str_strlen(str, enc);
2560  slen = str_strlen(sub, enc);
2561  /* substring longer than string */
2562  if (len < slen) return -1;
 /* pull pos back so that sub still fits between pos and the end */
2563  if (len - pos < slen) {
2564  pos = len - slen;
2565  }
2566  if (len == 0) {
2567  return pos;
2568  }
2569  sbeg = RSTRING_PTR(str);
2570  e = RSTRING_END(str);
2571  t = RSTRING_PTR(sub);
 /* slen becomes a byte length for memcmp */
2572  slen = RSTRING_LEN(sub);
2573  s = str_nth(sbeg, e, pos, enc, singlebyte);
 /* compare at each character position, stepping back one character at a
  * time until the start of the string */
2574  while (s) {
2575  if (memcmp(s, t, slen) == 0) {
2576  return pos;
2577  }
2578  if (pos == 0) break;
2579  pos--;
2580  s = rb_enc_prev_char(sbeg, s, e, enc);
2581  }
2582  return -1;
2583 }
2584 
2585 
2586 /*
2587  * call-seq:
2588  * str.rindex(substring [, fixnum]) -> fixnum or nil
2589  * str.rindex(regexp [, fixnum]) -> fixnum or nil
2590  *
2591  * Returns the index of the last occurrence of the given <i>substring</i> or
2592  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2593  * found. If the second parameter is present, it specifies the position in the
2594  * string to end the search---characters beyond this point will not be
2595  * considered.
2596  *
2597  * "hello".rindex('e') #=> 1
2598  * "hello".rindex('l') #=> 3
2599  * "hello".rindex('a') #=> nil
2600  * "hello".rindex(?e) #=> 1
2601  * "hello".rindex(/[aeiou]/, -2) #=> 1
2602  */
2603 
/*
 * Method body for the rindex call-seq documented above: reports the
 * character index of the last occurrence of sub (a Regexp, a String, or an
 * object convertible to a String) at or before an optional position, or nil.
 *
 * NOTE(review): gutter lines 2605, 2618 and 2633 are absent from this
 * listing, so the function name and parameter list, the statement inside
 * the T_REGEXP test below, and the remaining arguments of the str_offset()
 * call are not visible here.
 */
2604 static VALUE
2606 {
2607  VALUE sub;
2608  VALUE vpos;
2609  rb_encoding *enc = STR_ENC_GET(str);
2610  long pos, len = str_strlen(str, enc);
2611 
/* a return value of 2 means a position was supplied along with sub */
2612  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2613  pos = NUM2LONG(vpos);
/* a negative position is taken relative to the character length */
2614  if (pos < 0) {
2615  pos += len;
/* still negative after the adjustment: nothing can match, answer nil */
2616  if (pos < 0) {
2617  if (TYPE(sub) == T_REGEXP) {
2619  }
2620  return Qnil;
2621  }
2622  }
/* positions past the end are clamped to the character length */
2623  if (pos > len) pos = len;
2624  }
2625  else {
/* no position given: start from the end of the string */
2626  pos = len;
2627  }
2628 
2629  switch (TYPE(sub)) {
2630  case T_REGEXP:
2631  /* enc = rb_get_check(str, sub); */
2632  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2634 
2635  if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2636  pos = rb_reg_search(sub, str, pos, 1);
/* NOTE(review): presumably converts the byte offset of the match back to a character index -- confirm against rb_str_sublen */
2637  pos = rb_str_sublen(str, pos);
2638  }
2639  if (pos >= 0) return LONG2NUM(pos);
2640  break;
2641 
2642  default: {
2643  VALUE tmp;
2644 
/* anything that is neither Regexp nor String must convert to a String */
2645  tmp = rb_check_string_type(sub);
2646  if (NIL_P(tmp)) {
2647  rb_raise(rb_eTypeError, "type mismatch: %s given",
2648  rb_obj_classname(sub));
2649  }
2650  sub = tmp;
2651  }
2652  /* fall through */
2653  case T_STRING:
2654  pos = rb_str_rindex(str, sub, pos);
2655  if (pos >= 0) return LONG2NUM(pos);
2656  break;
2657  }
/* reached when neither search produced a non-negative index */
2658  return Qnil;
2659 }
2660 
2661 /*
2662  * call-seq:
2663  * str =~ obj -> fixnum or nil
2664  *
2665  * Match---If <i>obj</i> is a <code>Regexp</code>, uses it as a pattern to match
2666  * against <i>str</i>, and returns the position where the match starts, or
2667  * <code>nil</code> if there is no match. Otherwise, invokes
2668  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2669  * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2670  *
2671  * "cat o' 9 tails" =~ /\d/ #=> 7
2672  * "cat o' 9 tails" =~ 9 #=> nil
2673  */
2674 
/*
 * Method body for the =~ call-seq documented above; the body works on x
 * (passed as the argument) and y (the object dispatched on).
 *
 * NOTE(review): gutter line 2676, which carries the function name and
 * parameter list, is absent from this listing.
 */
2675 static VALUE
2677 {
2678  switch (TYPE(y)) {
2679  case T_STRING:
/* a String on the right-hand side is rejected outright */
2680  rb_raise(rb_eTypeError, "type mismatch: String given");
2681 
2682  case T_REGEXP:
2683  return rb_reg_match(y, x);
2684 
2685  default:
/* any other object gets its own =~ method called with x */
2686  return rb_funcall(y, rb_intern("=~"), 1, x);
2687  }
2688 }
2689 
2690 
2691 static VALUE get_pat(VALUE, int);
2692 
2693 
2694 /*
2695  * call-seq:
2696  * str.match(pattern) -> matchdata or nil
2697  * str.match(pattern, pos) -> matchdata or nil
2698  *
2699  * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2700  * then invokes its <code>match</code> method on <i>str</i>. If the second
2701  * parameter is present, it specifies the position in the string to begin the
2702  * search.
2703  *
2704  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
2705  * 'hello'.match('(.)\1')[0] #=> "ll"
2706  * 'hello'.match(/(.)\1/)[0] #=> "ll"
2707  * 'hello'.match('xx') #=> nil
2708  *
2709  * If a block is given, invokes the block with the MatchData if the match succeeds, so
2710  * that you can write
2711  *
2712  * str.match(pat) {|m| ...}
2713  *
2714  * instead of
2715  *
2716  * if m = str.match(pat)
2717  * ...
2718  * end
2719  *
2720  * The return value is a value from block execution in this case.
2721  */
2722 
/*
 * Method body for the match call-seq documented above: turns the first
 * argument into a pattern with get_pat(), calls its "match" method with str
 * substituted as the first argument, and yields a non-nil result to the
 * block when one is given.
 *
 * NOTE(review): gutter line 2724, which carries the function name and
 * parameter list, is absent from this listing.
 */
2723 static VALUE
2725 {
2726  VALUE re, result;
2727  if (argc < 1)
2728  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
2729  re = argv[0];
/* argv is reused for the forwarded call, so its first slot now holds str */
2730  argv[0] = str;
2731  result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
/* with a block and a non-nil result, the block's value is returned instead */
2732  if (!NIL_P(result) && rb_block_given_p()) {
2733  return rb_yield(result);
2734  }
2735  return result;
2736 }
2737 
2742 };
2743 
2744 static enum neighbor_char
2745 enc_succ_char(char *p, long len, rb_encoding *enc)
2746 {
2747  long i;
2748  int l;
2749  while (1) {
2750  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2751  p[i] = '\0';
2752  if (i < 0)
2753  return NEIGHBOR_WRAPPED;
2754  ++((unsigned char*)p)[i];
2755  l = rb_enc_precise_mbclen(p, p+len, enc);
2756  if (MBCLEN_CHARFOUND_P(l)) {
2757  l = MBCLEN_CHARFOUND_LEN(l);
2758  if (l == len) {
2759  return NEIGHBOR_FOUND;
2760  }
2761  else {
2762  memset(p+l, 0xff, len-l);
2763  }
2764  }
2765  if (MBCLEN_INVALID_P(l) && i < len-1) {
2766  long len2;
2767  int l2;
2768  for (len2 = len-1; 0 < len2; len2--) {
2769  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2770  if (!MBCLEN_INVALID_P(l2))
2771  break;
2772  }
2773  memset(p+len2+1, 0xff, len-(len2+1));
2774  }
2775  }
2776 }
2777 
2778 static enum neighbor_char
2779 enc_pred_char(char *p, long len, rb_encoding *enc)
2780 {
2781  long i;
2782  int l;
2783  while (1) {
2784  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2785  p[i] = '\xff';
2786  if (i < 0)
2787  return NEIGHBOR_WRAPPED;
2788  --((unsigned char*)p)[i];
2789  l = rb_enc_precise_mbclen(p, p+len, enc);
2790  if (MBCLEN_CHARFOUND_P(l)) {
2791  l = MBCLEN_CHARFOUND_LEN(l);
2792  if (l == len) {
2793  return NEIGHBOR_FOUND;
2794  }
2795  else {
2796  memset(p+l, 0, len-l);
2797  }
2798  }
2799  if (MBCLEN_INVALID_P(l) && i < len-1) {
2800  long len2;
2801  int l2;
2802  for (len2 = len-1; 0 < len2; len2--) {
2803  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2804  if (!MBCLEN_INVALID_P(l2))
2805  break;
2806  }
2807  memset(p+len2+1, 0, len-(len2+1));
2808  }
2809  }
2810 }
2811 
2812 /*
2813  overwrite +p+ by succeeding letter in +enc+ and returns
2814  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2815  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2816  assuming each ranges are successive, and mbclen
2817  never change in each ranges.
2818  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2819  character.
2820  */
2821 static enum neighbor_char
2822 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2823 {
2824  enum neighbor_char ret;
2825  unsigned int c;
2826  int ctype;
2827  int range;
2828  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2829 
2830  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2831  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2832  ctype = ONIGENC_CTYPE_DIGIT;
2833  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2834  ctype = ONIGENC_CTYPE_ALPHA;
2835  else
2836  return NEIGHBOR_NOT_CHAR;
2837 
2838  MEMCPY(save, p, char, len);
2839  ret = enc_succ_char(p, len, enc);
2840  if (ret == NEIGHBOR_FOUND) {
2841  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2842  if (rb_enc_isctype(c, ctype, enc))
2843  return NEIGHBOR_FOUND;
2844  }
2845  MEMCPY(p, save, char, len);
2846  range = 1;
2847  while (1) {
2848  MEMCPY(save, p, char, len);
2849  ret = enc_pred_char(p, len, enc);
2850  if (ret == NEIGHBOR_FOUND) {
2851  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2852  if (!rb_enc_isctype(c, ctype, enc)) {
2853  MEMCPY(p, save, char, len);
2854  break;
2855  }
2856  }
2857  else {
2858  MEMCPY(p, save, char, len);
2859  break;
2860  }
2861  range++;
2862  }
2863  if (range == 1) {
2864  return NEIGHBOR_NOT_CHAR;
2865  }
2866 
2867  if (ctype != ONIGENC_CTYPE_DIGIT) {
2868  MEMCPY(carry, p, char, len);
2869  return NEIGHBOR_WRAPPED;
2870  }
2871 
2872  MEMCPY(carry, p, char, len);
2873  enc_succ_char(carry, len, enc);
2874  return NEIGHBOR_WRAPPED;
2875 }
2876 
2877 
2878 /*
2879  * call-seq:
2880  * str.succ -> new_str
2881  * str.next -> new_str
2882  *
2883  * Returns the successor to <i>str</i>. The successor is calculated by
2884  * incrementing characters starting from the rightmost alphanumeric (or
2885  * the rightmost character if there are no alphanumerics) in the
2886  * string. Incrementing a digit always results in another digit, and
2887  * incrementing a letter results in another letter of the same case.
2888  * Incrementing nonalphanumerics uses the underlying character set's
2889  * collating sequence.
2890  *
2891  * If the increment generates a ``carry,'' the character to the left of
2892  * it is incremented. This process repeats until there is no carry,
2893  * adding an additional character if necessary.
2894  *
2895  * "abcd".succ #=> "abce"
2896  * "THX1138".succ #=> "THX1139"
2897  * "<<koala>>".succ #=> "<<koalb>>"
2898  * "1999zzz".succ #=> "2000aaa"
2899  * "ZZZ9999".succ #=> "AAAA0000"
2900  * "***".succ #=> "**+"
2901  */
2902 
/*
 * Method body for the succ/next call-seq documented above: builds a copy of
 * orig and increments it from the right, inserting a carry character when
 * the increment wraps.
 *
 * NOTE(review): gutter line 2904, which carries the function name and
 * parameter list, is absent from this listing.
 */
2903 VALUE
2905 {
2906  rb_encoding *enc;
2907  VALUE str;
2908  char *sbeg, *s, *e, *last_alnum = 0;
2909  int c = -1;
2910  long l;
2911  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2912  long carry_pos = 0, carry_len = 1;
2913  enum neighbor_char neighbor = NEIGHBOR_FOUND;
2914 
/* all work happens on a fresh copy; orig itself is not written to */
2915  str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2916  rb_enc_cr_str_copy_for_substr(str, orig);
2917  OBJ_INFECT(str, orig);
/* an empty string is returned as an empty copy */
2918  if (RSTRING_LEN(str) == 0) return str;
2919 
2920  enc = STR_ENC_GET(orig);
2921  sbeg = RSTRING_PTR(str);
2922  s = e = sbeg + RSTRING_LEN(str);
2923 
/* first pass: walk characters right to left, incrementing alphanumerics */
2924  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* after skipping a non-alnum: stop at the previously wrapped character when this one is of the other kind (letter vs. digit) */
2925  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2926  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2927  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2928  s = last_alnum;
2929  break;
2930  }
2931  }
2932  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2933  neighbor = enc_succ_alnum_char(s, l, enc, carry);
2934  switch (neighbor) {
2935  case NEIGHBOR_NOT_CHAR:
2936  continue;
2937  case NEIGHBOR_FOUND:
/* incremented without a carry: the result is complete */
2938  return str;
2939  case NEIGHBOR_WRAPPED:
2940  last_alnum = s;
2941  break;
2942  }
/* a wrap happened: remember where a carry character would be inserted */
2943  c = 1;
2944  carry_pos = s - sbeg;
2945  carry_len = l;
2946  }
2947  if (c == -1) { /* str contains no alnum */
/* second pass: increment arbitrary characters right to left instead */
2948  s = e;
2949  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2950  enum neighbor_char neighbor;
2951  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2952  neighbor = enc_succ_char(s, l, enc);
2953  if (neighbor == NEIGHBOR_FOUND)
2954  return str;
2955  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2956  /* wrapped to \0...\0. search next valid char. */
2957  enc_succ_char(s, l, enc);
2958  }
/* for encodings that are not ASCII compatible the default "\1" carry is replaced by the wrapped character */
2959  if (!rb_enc_asciicompat(enc)) {
2960  MEMCPY(carry, s, char, l);
2961  carry_len = l;
2962  }
2963  carry_pos = s - sbeg;
2964  }
2965  }
/* every position wrapped: make room and insert the carry at carry_pos */
2966  RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2967  s = RSTRING_PTR(str) + carry_pos;
2968  memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2969  memmove(s, carry, carry_len);
2970  STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2971  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2972  rb_enc_str_coderange(str);
2973  return str;
2974 }
2975 
2976 
2977 /*
2978  * call-seq:
2979  * str.succ! -> str
2980  * str.next! -> str
2981  *
2982  * Equivalent to <code>String#succ</code>, but modifies the receiver in
2983  * place.
2984  */
2985 
/*
 * Method body for the succ!/next! call-seq documented above; returns str.
 *
 * NOTE(review): gutter lines 2987 (function name and parameter list) and
 * 2989 (the only statement before the return) are absent from this
 * listing, so the in-place update itself is not visible here.
 */
2986 static VALUE
2988 {
2990 
2991  return str;
2992 }
2993 
2994 
2995 /*
2996  * call-seq:
2997  * str.upto(other_str, exclusive=false) {|s| block } -> str
2998  * str.upto(other_str, exclusive=false) -> an_enumerator
2999  *
3000  * Iterates through successive values, starting at <i>str</i> and
3001  * ending at <i>other_str</i> inclusive, passing each value in turn to
3002  * the block. The <code>String#succ</code> method is used to generate
3003  * each value. If optional second argument exclusive is omitted or is false,
3004  * the last value will be included; otherwise it will be excluded.
3005  *
3006  * If no block is given, an enumerator is returned instead.
3007  *
3008  * "a8".upto("b6") {|s| print s, ' ' }
3009  * for s in "a8".."b6"
3010  * print s, ' '
3011  * end
3012  *
3013  * <em>produces:</em>
3014  *
3015  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3016  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3017  *
3018  * If <i>str</i> and <i>other_str</i> contain only ascii numeric characters,
3019  * both are recognized as decimal numbers. In addition, the width of
3020  * string (e.g. leading zeros) is handled appropriately.
3021  *
3022  * "9".upto("11").to_a #=> ["9", "10", "11"]
3023  * "25".upto("5").to_a #=> []
3024  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
3025  */
3026 
/*
 * Method body for the upto call-seq documented above: yields successive
 * strings from beg up to end (end itself is left out when the second
 * argument is truthy) and returns beg.
 *
 * Three strategies are used: a byte loop for two single-character ASCII
 * strings, an integer loop when both strings consist of ASCII digits only,
 * and repeated calls to "succ" otherwise.
 *
 * NOTE(review): gutter line 3028, which carries the function name and
 * parameter list, is absent from this listing.
 */
3027 static VALUE
3029 {
3030  VALUE end, exclusive;
3031  VALUE current, after_end;
3032  ID succ;
3033  int n, excl, ascii;
3034  rb_encoding *enc;
3035 
3036  rb_scan_args(argc, argv, "11", &end, &exclusive);
3037  RETURN_ENUMERATOR(beg, argc, argv);
3038  excl = RTEST(exclusive);
3039  CONST_ID(succ, "succ");
3040  StringValue(end);
3041  enc = rb_enc_check(beg, end);
3042  ascii = (is_ascii_string(beg) && is_ascii_string(end));
3043  /* single character */
3044  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3045  char c = RSTRING_PTR(beg)[0];
3046  char e = RSTRING_PTR(end)[0];
3047 
/* empty range: nothing is yielded */
3048  if (c > e || (excl && c == e)) return beg;
3049  for (;;) {
3050  rb_yield(rb_enc_str_new(&c, 1, enc));
/* inclusive ranges stop after yielding e, exclusive ones just before it */
3051  if (!excl && c == e) break;
3052  c++;
3053  if (excl && c == e) break;
3054  }
3055  return beg;
3056  }
3057  /* both edges are all digits */
3058  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3059  char *s, *send;
3060  VALUE b, e;
3061  int width;
3062 
3063  s = RSTRING_PTR(beg); send = RSTRING_END(beg);
/* the byte length of beg is the precision used when formatting each number */
3064  width = rb_long2int(send - s);
/* any non-digit in either string sends control to the general path */
3065  while (s < send) {
3066  if (!ISDIGIT(*s)) goto no_digits;
3067  s++;
3068  }
3069  s = RSTRING_PTR(end); send = RSTRING_END(end);
3070  while (s < send) {
3071  if (!ISDIGIT(*s)) goto no_digits;
3072  s++;
3073  }
3074  b = rb_str_to_inum(beg, 10, FALSE);
3075  e = rb_str_to_inum(end, 10, FALSE);
3076  if (FIXNUM_P(b) && FIXNUM_P(e)) {
/* both bounds are Fixnums: count with plain C longs */
3077  long bi = FIX2LONG(b);
3078  long ei = FIX2LONG(e);
3079  rb_encoding *usascii = rb_usascii_encoding();
3080 
3081  while (bi <= ei) {
3082  if (excl && bi == ei) break;
3083  rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3084  bi++;
3085  }
3086  }
3087  else {
/* otherwise compare and step through method calls on the number objects */
3088  ID op = excl ? '<' : rb_intern("<=");
3089  VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3090 
3091  args[0] = INT2FIX(width);
3092  while (rb_funcall(b, op, 1, e)) {
3093  args[1] = b;
3094  rb_yield(rb_str_format(numberof(args), args, fmt));
3095  b = rb_funcall(b, succ, 0, 0);
3096  }
3097  }
3098  return beg;
3099  }
3100  /* normal case */
3101  no_digits:
3102  n = rb_str_cmp(beg, end);
3103  if (n > 0 || (excl && n == 0)) return beg;
3104 
/* the string after end is one of the loop's stop conditions */
3105  after_end = rb_funcall(end, succ, 0, 0);
3106  current = rb_str_dup(beg);
3107  while (!rb_str_equal(current, after_end)) {
3108  VALUE next = Qnil;
/* the successor is computed before yielding; it stays nil once an inclusive range reaches end */
3109  if (excl || !rb_str_equal(current, end))
3110  next = rb_funcall(current, succ, 0, 0);
3111  rb_yield(current);
3112  if (NIL_P(next)) break;
3113  current = next;
3114  StringValue(current);
3115  if (excl && rb_str_equal(current, end)) break;
/* stop once the value has grown longer than end or has become empty */
3116  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3117  break;
3118  }
3119 
3120  return beg;
3121 }
3122 
/*
 * Searches str for re from offset 0 and, when the search succeeds, returns
 * the match component selected by backref (resolved to a group number with
 * rb_reg_backref_number()); returns nil when the search fails.
 *
 * NOTE(review): gutter line 3127 is absent from this listing; `match` is
 * used below without a visible declaration, so it is presumably declared
 * and assigned on that line -- confirm against the original file.
 */
3123 static VALUE
3124 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3125 {
3126  if (rb_reg_search(re, str, 0, 0) >= 0) {
3128  int nth = rb_reg_backref_number(match, backref);
3129  return rb_reg_nth_match(nth, match);
3130  }
3131  return Qnil;
3132 }
3133 
/*
 * Single-index element reference: dispatches on the type of indx.
 *
 *   Fixnum  - one-character substring at that index, nil when that
 *             substring is empty
 *   Regexp  - the whole match (group 0), via rb_str_subpat()
 *   String  - a copy of indx when it occurs in str, otherwise nil
 *   other   - tried as a Range first, then converted with NUM2LONG()
 *
 * NOTE(review): gutter line 3135, which carries the function name and
 * parameter list, is absent from this listing.
 */
3134 static VALUE
3136 {
3137  long idx;
3138 
3139  switch (TYPE(indx)) {
3140  case T_FIXNUM:
3141  idx = FIX2LONG(indx);
3142 
3143  num_index:
3144  str = rb_str_substr(str, idx, 1);
/* an empty one-character slice is reported as nil */
3145  if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3146  return str;
3147 
3148  case T_REGEXP:
3149  return rb_str_subpat(str, indx, INT2FIX(0));
3150 
3151  case T_STRING:
3152  if (rb_str_index(str, indx, 0) != -1)
3153  return rb_str_dup(indx);
3154  return Qnil;
3155 
3156  default:
3157  /* check if indx is Range */
3158  {
3159  long beg, len;
3160  VALUE tmp;
3161 
3162  len = str_strlen(str, STR_ENC_GET(str));
/* Qfalse falls through to the numeric conversion below, Qnil yields nil */
3163  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3164  case Qfalse:
3165  break;
3166  case Qnil:
3167  return Qnil;
3168  default:
3169  tmp = rb_str_substr(str, beg, len);
3170  return tmp;
3171  }
3172  }
3173  idx = NUM2LONG(indx);
3174  goto num_index;
3175  }
3176  return Qnil; /* not reached */
3177 }
3178 
3179 
3180 /*
3181  * call-seq:
3182  * str[fixnum] -> new_str or nil
3183  * str[fixnum, fixnum] -> new_str or nil
3184  * str[range] -> new_str or nil
3185  * str[regexp] -> new_str or nil
3186  * str[regexp, fixnum] -> new_str or nil
3187  * str[other_str] -> new_str or nil
3188  * str.slice(fixnum) -> new_str or nil
3189  * str.slice(fixnum, fixnum) -> new_str or nil
3190  * str.slice(range) -> new_str or nil
3191  * str.slice(regexp) -> new_str or nil
3192  * str.slice(regexp, fixnum) -> new_str or nil
3193  * str.slice(regexp, capname) -> new_str or nil
3194  * str.slice(other_str) -> new_str or nil
3195  *
3196  * Element Reference---If passed a single <code>Fixnum</code>, returns a
3197  * substring of one character at that position. If passed two <code>Fixnum</code>
3198  * objects, returns a substring starting at the offset given by the first, and
3199  * with a length given by the second. If passed a range, its beginning and end
3200  * are interpreted as offsets delimiting the substring to be returned. In all
3201  * three cases, if an offset is negative, it is counted from the end of <i>str</i>.
3202  * Returns <code>nil</code> if the initial offset falls outside the string or
3203  * the length is negative.
3204  *
3205  * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
3206  * returned. If a numeric or name parameter follows the regular expression, that
3207  * component of the <code>MatchData</code> is returned instead. If a
3208  * <code>String</code> is given, that string is returned if it occurs in
3209  * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
3210  * match.
3211  *
3212  * a = "hello there"
3213  * a[1] #=> "e"
3214  * a[2, 3] #=> "llo"
3215  * a[2..3] #=> "ll"
3216  * a[-3, 2] #=> "er"
3217  * a[7..-2] #=> "her"
3218  * a[-4..-2] #=> "her"
3219  * a[-2..-4] #=> ""
3220  * a[12..-1] #=> nil
3221  * a[/[aeiou](.)\1/] #=> "ell"
3222  * a[/[aeiou](.)\1/, 0] #=> "ell"
3223  * a[/[aeiou](.)\1/, 1] #=> "l"
3224  * a[/[aeiou](.)\1/, 2] #=> nil
3225  * a["lo"] #=> "lo"
3226  * a["bye"] #=> nil
3227  */
3228 
/*
 * Method body for the []/slice call-seq documented above.  With two
 * arguments it handles the (regexp, group) and (start, length) forms; with
 * one argument it defers to rb_str_aref(); any other count raises
 * ArgumentError.
 *
 * NOTE(review): gutter line 3230, which carries the function name and
 * parameter list, is absent from this listing.
 */
3229 static VALUE
3231 {
3232  if (argc == 2) {
3233  if (TYPE(argv[0]) == T_REGEXP) {
3234  return rb_str_subpat(str, argv[0], argv[1]);
3235  }
3236  return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3237  }
3238  if (argc != 1) {
3239  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3240  }
3241  return rb_str_aref(str, argv[0]);
3242 }
3243 
3244 VALUE
3245 rb_str_drop_bytes(VALUE str, long len)
3246 {
3247  char *ptr = RSTRING_PTR(str);
3248  long olen = RSTRING_LEN(str), nlen;
3249 
3250  str_modifiable(str);
3251  if (len > olen) len = olen;
3252  nlen = olen - len;
3253  if (nlen <= RSTRING_EMBED_LEN_MAX) {
3254  char *oldptr = ptr;
3255  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
3256  STR_SET_EMBED(str);
3257  STR_SET_EMBED_LEN(str, nlen);
3258  ptr = RSTRING(str)->as.ary;
3259  memmove(ptr, oldptr + len, nlen);
3260  if (fl == STR_NOEMBED) xfree(oldptr);
3261  }
3262  else {
3263  if (!STR_SHARED_P(str)) rb_str_new4(str);
3264  ptr = RSTRING(str)->as.heap.ptr += len;
3265  RSTRING(str)->as.heap.len = nlen;
3266  }
3267  ptr[nlen] = 0;
3268  ENC_CODERANGE_CLEAR(str);
3269  return str;
3270 }
3271 
3272 static void
3273 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
3274 {
3275  if (beg == 0 && RSTRING_LEN(val) == 0) {
3276  rb_str_drop_bytes(str, len);
3277  OBJ_INFECT(str, val);
3278  return;
3279  }
3280 
3281  rb_str_modify(str);
3282  if (len < RSTRING_LEN(val)) {
3283  /* expand string */
3284  RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
3285  }
3286 
3287  if (RSTRING_LEN(val) != len) {
3288  memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
3289  RSTRING_PTR(str) + beg + len,
3290  RSTRING_LEN(str) - (beg + len));
3291  }
3292  if (RSTRING_LEN(val) < beg && len < 0) {
3293  MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
3294  }
3295  if (RSTRING_LEN(val) > 0) {
3296  memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
3297  }
3298  STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
3299  if (RSTRING_PTR(str)) {
3300  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3301  }
3302  OBJ_INFECT(str, val);
3303 }
3304 
/*
 * Replaces len characters of str starting at character index beg with val.
 * A negative len raises IndexError, as does a beg that lies outside the
 * string; a negative beg counts from the end.  The range is clipped to the
 * end of the string, converted to a byte offset and byte length, and then
 * handed to rb_str_splice_0().
 *
 * NOTE(review): gutter line 3343 is absent from this listing; `cr` is
 * tested below without a visible assignment, so it is presumably computed
 * on that line -- confirm against the original file.
 */
3305 static void
3306 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3307 {
3308  long slen;
3309  char *p, *e;
3310  rb_encoding *enc;
3311  int singlebyte = single_byte_optimizable(str);
3312  int cr;
3313 
3314  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3315 
3316  StringValue(val);
3317  enc = rb_enc_check(str, val);
3318  slen = str_strlen(str, enc);
3319 
3320  if (slen < beg) {
3321  out_of_range:
3322  rb_raise(rb_eIndexError, "index %ld out of string", beg);
3323  }
/* a negative index counts from the end and may not reach before the start */
3324  if (beg < 0) {
3325  if (-beg > slen) {
3326  goto out_of_range;
3327  }
3328  beg += slen;
3329  }
/* clip the range so that it ends at the end of the string */
3330  if (slen < len || slen < beg + len) {
3331  len = slen - beg;
3332  }
3333  str_modify_keep_cr(str);
/* locate the byte positions of the first and one-past-last characters; a null result is replaced by the end of the string */
3334  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3335  if (!p) p = RSTRING_END(str);
3336  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3337  if (!e) e = RSTRING_END(str);
3338  /* error check */
3339  beg = p - RSTRING_PTR(str); /* physical position */
3340  len = e - p; /* physical length */
3341  rb_str_splice_0(str, beg, len, val);
3342  rb_enc_associate(str, enc);
/* the code range is stored only when it is not ENC_CODERANGE_BROKEN */
3344  if (cr != ENC_CODERANGE_BROKEN)
3345  ENC_CODERANGE_SET(str, cr);
3346 }
3347 
/*
 * Non-static entry point that forwards its arguments unchanged to
 * rb_str_splice(): replaces len characters of str starting at character
 * index beg with val.
 */
3348 void
3349 rb_str_update(VALUE str, long beg, long len, VALUE val)
3350 {
3351  rb_str_splice(str, beg, len, val);
3352 }
3353 
3354 static void
3355 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3356 {
3357  int nth;
3358  VALUE match;
3359  long start, end, len;
3360  rb_encoding *enc;
3361  struct re_registers *regs;
3362 
3363  if (rb_reg_search(re, str, 0, 0) < 0) {
3364  rb_raise(rb_eIndexError, "regexp not matched");
3365  }
3366  match = rb_backref_get();
3367  nth = rb_reg_backref_number(match, backref);
3368  regs = RMATCH_REGS(match);
3369  if (nth >= regs->num_regs) {
3370  out_of_range:
3371  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3372  }
3373  if (nth < 0) {
3374  if (-nth >= regs->num_regs) {
3375  goto out_of_range;
3376  }
3377  nth += regs->num_regs;
3378  }
3379 
3380  start = BEG(nth);
3381  if (start == -1) {
3382  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3383  }
3384  end = END(nth);
3385  len = end - start;
3386  StringValue(val);
3387  enc = rb_enc_check(str, val);
3388  rb_str_splice_0(str, start, len, val);
3389  rb_enc_associate(str, enc);
3390 }
3391 
3392 static VALUE
3393 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3394 {
3395  long idx, beg;
3396 
3397  switch (TYPE(indx)) {
3398  case T_FIXNUM:
3399  idx = FIX2LONG(indx);
3400  num_index:
3401  rb_str_splice(str, idx, 1, val);
3402  return val;
3403 
3404  case T_REGEXP:
3405  rb_str_subpat_set(str, indx, INT2FIX(0), val);
3406  return val;
3407 
3408  case T_STRING:
3409  beg = rb_str_index(str, indx, 0);
3410  if (beg < 0) {
3411  rb_raise(rb_eIndexError, "string not matched");
3412  }
3413  beg = rb_str_sublen(str, beg);
3414  rb_str_splice(str, beg, str_strlen(indx, 0), val);
3415  return val;
3416 
3417  default:
3418  /* check if indx is Range */
3419  {
3420  long beg, len;
3421  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3422  rb_str_splice(str, beg, len, val);
3423  return val;
3424  }
3425  }
3426  idx = NUM2LONG(indx);
3427  goto num_index;
3428  }
3429 }
3430 
3431 /*
3432  * call-seq:
3433  * str[fixnum] = new_str
3434  * str[fixnum, fixnum] = new_str
3435  * str[range] = new_str
3436  * str[regexp] = new_str
3437  * str[regexp, fixnum] = new_str
3438  * str[regexp, name] = new_str
3439  * str[other_str] = new_str
3440  *
3441  * Element Assignment---Replaces some or all of the content of <i>str</i>. The
3442  * portion of the string affected is determined using the same criteria as
3443  * <code>String#[]</code>. If the replacement string is not the same length as
3444  * the text it is replacing, the string will be adjusted accordingly. If the
3445  * regular expression or string used as the index doesn't match a position
3446  * in the string, <code>IndexError</code> is raised. If the regular expression
3447  * form is used, the optional second <code>Fixnum</code> allows you to specify
3448  * which portion of the match to replace (effectively using the
3449  * <code>MatchData</code> indexing rules). The forms that take a
3450  * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3451  * out of range; the <code>Range</code> form will raise a
3452  * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3453  * forms will raise an <code>IndexError</code> if there is no match.
3454  */
3455 
/*
 * Method body for the []= call-seq documented above.  With three arguments
 * it handles the (regexp, group, value) and (start, length, value) forms
 * and returns the value; with two arguments it defers to rb_str_aset(); any
 * other count raises ArgumentError.
 *
 * NOTE(review): gutter line 3457, which carries the function name and
 * parameter list, is absent from this listing.
 */
3456 static VALUE
3458 {
3459  if (argc == 3) {
3460  if (TYPE(argv[0]) == T_REGEXP) {
3461  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3462  }
3463  else {
3464  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3465  }
3466  return argv[2];
3467  }
3468  if (argc != 2) {
3469  rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
3470  }
3471  return rb_str_aset(str, argv[0], argv[1]);
3472 }
3473 
3474 /*
3475  * call-seq:
3476  * str.insert(index, other_str) -> str
3477  *
3478  * Inserts <i>other_str</i> before the character at the given
3479  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
3480  * end of the string, and insert <em>after</em> the given character.
3481  * The intent is to insert <i>other_str</i> so that it starts at the given
3482  * <i>index</i>.
3483  *
3484  * "abcd".insert(0, 'X') #=> "Xabcd"
3485  * "abcd".insert(3, 'X') #=> "abcXd"
3486  * "abcd".insert(4, 'X') #=> "abcdX"
3487  * "abcd".insert(-3, 'X') #=> "abXcd"
3488  * "abcd".insert(-1, 'X') #=> "abcdX"
3489  */
3490 
/*
 * Method body for the insert call-seq documented above: splices str2 into
 * str with a replaced length of 0 and returns str.
 *
 * NOTE(review): gutter line 3492, which carries the function name and
 * parameter list, is absent from this listing.
 */
3491 static VALUE
3493 {
3494  long pos = NUM2LONG(idx);
3495 
/* -1 means "after the last character", which is a plain append */
3496  if (pos == -1) {
3497  return rb_str_append(str, str2);
3498  }
/* other negative indexes insert after the addressed character, hence the shift by one */
3499  else if (pos < 0) {
3500  pos++;
3501  }
3502  rb_str_splice(str, pos, 0, str2);
3503  return str;
3504 }
3505 
3506 
3507 /*
3508  * call-seq:
3509  * str.slice!(fixnum) -> new_str or nil
3510  * str.slice!(fixnum, fixnum) -> new_str or nil
3511  * str.slice!(range) -> new_str or nil
3512  * str.slice!(regexp) -> new_str or nil
3513  * str.slice!(other_str) -> new_str or nil
3514  *
3515  * Deletes the specified portion from <i>str</i>, and returns the portion
3516  * deleted.
3517  *
3518  * string = "this is a string"
3519  * string.slice!(2) #=> "i"
3520  * string.slice!(3..6) #=> " is "
3521  * string.slice!(/s.*t/) #=> "sa st"
3522  * string.slice!("r") #=> "r"
3523  * string #=> "thing"
3524  */
3525 
/*
 * Method body for the slice! call-seq documented above: looks the portion
 * up with rb_str_aref_m() and, when that is not nil, assigns an empty
 * string to the same index expression through rb_str_aset_m().  Returns
 * the looked-up portion (or nil).
 *
 * NOTE(review): gutter line 3527, which carries the function name and
 * parameter list, is absent from this listing.
 */
3526 static VALUE
3528 {
3529  VALUE result;
/* room for up to two index arguments plus the empty replacement string */
3530  VALUE buf[3];
3531  int i;
3532 
3533  if (argc < 1 || 2 < argc) {
3534  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3535  }
3536  for (i=0; i<argc; i++) {
3537  buf[i] = argv[i];
3538  }
3539  str_modify_keep_cr(str);
3540  result = rb_str_aref_m(argc, buf, str);
3541  if (!NIL_P(result)) {
/* i equals argc here, so the empty string lands right after the copied arguments */
3542  buf[i] = rb_str_new(0,0);
3543  rb_str_aset_m(argc+1, buf, str);
3544  }
3545  return result;
3546 }
3547 
3548 static VALUE
3549 get_pat(VALUE pat, int quote)
3550 {
3551  VALUE val;
3552 
3553  switch (TYPE(pat)) {
3554  case T_REGEXP:
3555  return pat;
3556 
3557  case T_STRING:
3558  break;
3559 
3560  default:
3561  val = rb_check_string_type(pat);
3562  if (NIL_P(val)) {
3563  Check_Type(pat, T_REGEXP);
3564  }
3565  pat = val;
3566  }
3567 
3568  if (quote) {
3569  pat = rb_reg_quote(pat);
3570  }
3571 
3572  return rb_reg_regcomp(pat);
3573 }
3574 
3575 
3576 /*
3577  * call-seq:
3578  * str.sub!(pattern, replacement) -> str or nil
3579  * str.sub!(pattern) {|match| block } -> str or nil
3580  *
3581  * Performs the substitutions of <code>String#sub</code> in place,
3582  * returning <i>str</i>, or <code>nil</code> if no substitutions were
3583  * performed.
3584  */
3585 
/*
 * Method body for the sub! call-seq documented above: replaces the first
 * match of the pattern in str, in place.  The replacement comes from the
 * block, from a Hash lookup keyed by the matched text, or from a
 * replacement String expanded with rb_reg_regsub().  Returns str after a
 * substitution and nil when the pattern does not match.
 *
 * NOTE(review): gutter lines 3587 (function name and parameter list) and
 * 3616 are absent from this listing; `match` is used below without a
 * visible declaration, so it is presumably declared and assigned on line
 * 3616 -- confirm against the original file.
 */
3586 static VALUE
3588 {
3589  VALUE pat, repl, hash = Qnil;
3590  int iter = 0;
3591  int tainted = 0;
3592  int untrusted = 0;
3593  long plen;
3594 
/* one argument plus a block selects the block form */
3595  if (argc == 1 && rb_block_given_p()) {
3596  iter = 1;
3597  }
3598  else if (argc == 2) {
3599  repl = argv[1];
/* the second argument is tried as a Hash first and must otherwise be a String */
3600  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3601  if (NIL_P(hash)) {
3602  StringValue(repl);
3603  }
3604  if (OBJ_TAINTED(repl)) tainted = 1;
3605  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3606  }
3607  else {
3608  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3609  }
3610 
3611  pat = get_pat(argv[0], 1);
3612  str_modifiable(str);
3613  if (rb_reg_search(pat, str, 0, 0) >= 0) {
3614  rb_encoding *enc;
3615  int cr = ENC_CODERANGE(str);
3617  struct re_registers *regs = RMATCH_REGS(match);
/* byte offsets of the whole match */
3618  long beg0 = BEG(0);
3619  long end0 = END(0);
3620  char *p, *rp;
3621  long len, rlen;
3622 
3623  if (iter || !NIL_P(hash)) {
3624  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3625 
3626  if (iter) {
3627  repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3628  }
3629  else {
3630  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3631  repl = rb_obj_as_string(repl);
3632  }
/* arbitrary Ruby code ran above: re-check str against the pointer and length saved before it */
3633  str_mod_check(str, p, len);
3634  rb_check_frozen(str);
3635  }
3636  else {
3637  repl = rb_reg_regsub(repl, str, regs, pat);
3638  }
3639  enc = rb_enc_compatible(str, repl);
3640  if (!enc) {
/* incompatible encodings are tolerated only when everything outside the match is 7-bit; the result then takes repl's encoding */
3641  rb_encoding *str_enc = STR_ENC_GET(str);
3642  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3643  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
3644  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
3645  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3646  rb_enc_name(str_enc),
3647  rb_enc_name(STR_ENC_GET(repl)));
3648  }
3649  enc = STR_ENC_GET(repl);
3650  }
3651  rb_str_modify(str);
3652  rb_enc_associate(str, enc);
3653  if (OBJ_TAINTED(repl)) tainted = 1;
3654  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
/* combine the code range saved from str with the one of repl */
3655  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3656  int cr2 = ENC_CODERANGE(repl);
3657  if (cr2 == ENC_CODERANGE_BROKEN ||
3658  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
3659  cr = ENC_CODERANGE_UNKNOWN;
3660  else
3661  cr = cr2;
3662  }
3663  plen = end0 - beg0;
3664  rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
3665  len = RSTRING_LEN(str);
/* grow the buffer only when the replacement is longer than the match */
3666  if (rlen > plen) {
3667  RESIZE_CAPA(str, len + rlen - plen);
3668  }
3669  p = RSTRING_PTR(str);
/* move the text after the match to its new position, then copy the replacement in */
3670  if (rlen != plen) {
3671  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
3672  }
3673  memcpy(p + beg0, rp, rlen);
3674  len += rlen - plen;
3675  STR_SET_LEN(str, len);
3676  RSTRING_PTR(str)[len] = '\0';
3677  ENC_CODERANGE_SET(str, cr);
3678  if (tainted) OBJ_TAINT(str);
3679  if (untrusted) OBJ_UNTRUST(str);
3680 
3681  return str;
3682  }
3683  return Qnil;
3684 }
3685 
3686 
3687 /*
3688  * call-seq:
3689  * str.sub(pattern, replacement) -> new_str
3690  * str.sub(pattern, hash) -> new_str
3691  * str.sub(pattern) {|match| block } -> new_str
3692  *
3693  * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3694  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3695  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3696  * regular expression metacharacters it contains will be interpreted
3697  * literally, e.g. <code>'\\\d'</code> will match a backlash followed by 'd',
3698  * instead of a digit.
3699  *
3700  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3701  * the matched text. It may contain back-references to the pattern's capture
3702  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3703  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3704  * double-quoted string, both back-references must be preceded by an
3705  * additional backslash. However, within <i>replacement</i> the special match
3706  * variables, such as <code>$&</code>, will not refer to the current match.
3707  *
3708  * If the second argument is a <code>Hash</code>, and the matched text is one
3709  * of its keys, the corresponding value is the replacement string.
3710  *
3711  * In the block form, the current match string is passed in as a parameter,
3712  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3713  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3714  * returned by the block will be substituted for the match on each call.
3715  *
3716  * The result inherits any tainting in the original string or any supplied
3717  * replacement string.
3718  *
3719  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
3720  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
3721  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
3722  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
3723  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3724  * #=> "Is /bin/bash your preferred shell?"
3725  */
3726 
/*
 * Method body for the sub call-seq documented above: duplicates str,
 * applies rb_str_sub_bang() to the duplicate and returns the duplicate
 * whether or not a substitution took place.
 *
 * NOTE(review): gutter line 3728, which carries the function name and
 * parameter list, is absent from this listing.
 */
3727 static VALUE
3729 {
3730  str = rb_str_dup(str);
3731  rb_str_sub_bang(argc, argv, str);
3732  return str;
3733 }
3734 
3735 static VALUE
3736 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3737 {
3738  VALUE pat, val, repl, match, dest, hash = Qnil;
3739  struct re_registers *regs;
3740  long beg, n;
3741  long beg0, end0;
3742  long offset, blen, slen, len, last;
3743  int iter = 0;
3744  char *sp, *cp;
3745  int tainted = 0;
3746  rb_encoding *str_enc;
3747 
3748  switch (argc) {
3749  case 1:
3750  RETURN_ENUMERATOR(str, argc, argv);
3751  iter = 1;
3752  break;
3753  case 2:
3754  repl = argv[1];
3755  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3756  if (NIL_P(hash)) {
3757  StringValue(repl);
3758  }
3759  if (OBJ_TAINTED(repl)) tainted = 1;
3760  break;
3761  default:
3762  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3763  }
3764 
3765  pat = get_pat(argv[0], 1);
3766  beg = rb_reg_search(pat, str, 0, 0);
3767  if (beg < 0) {
3768  if (bang) return Qnil; /* no match, no substitution */
3769  return rb_str_dup(str);
3770  }
3771 
3772  offset = 0;
3773  n = 0;
3774  blen = RSTRING_LEN(str) + 30; /* len + margin */
3775  dest = rb_str_buf_new(blen);
3776  sp = RSTRING_PTR(str);
3777  slen = RSTRING_LEN(str);
3778  cp = sp;
3779  str_enc = STR_ENC_GET(str);
3780  rb_enc_associate(dest, str_enc);
3782 
3783  do {
3784  n++;
3785  match = rb_backref_get();
3786  regs = RMATCH_REGS(match);
3787  beg0 = BEG(0);
3788  end0 = END(0);
3789  if (iter || !NIL_P(hash)) {
3790  if (iter) {
3791  val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3792  }
3793  else {
3794  val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3795  val = rb_obj_as_string(val);
3796  }
3797  str_mod_check(str, sp, slen);
3798  if (val == dest) { /* paranoid check [ruby-dev:24827] */
3799  rb_raise(rb_eRuntimeError, "block should not cheat");
3800  }
3801  }
3802  else {
3803  val = rb_reg_regsub(repl, str, regs, pat);
3804  }
3805 
3806  if (OBJ_TAINTED(val)) tainted = 1;
3807 
3808  len = beg - offset; /* copy pre-match substr */
3809  if (len) {
3810  rb_enc_str_buf_cat(dest, cp, len, str_enc);
3811  }
3812 
3813  rb_str_buf_append(dest, val);
3814 
3815  last = offset;
3816  offset = end0;
3817  if (beg0 == end0) {
3818  /*
3819  * Always consume at least one character of the input string
3820  * in order to prevent infinite loops.
3821  */
3822  if (RSTRING_LEN(str) <= end0) break;
3823  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3824  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3825  offset = end0 + len;
3826  }
3827  cp = RSTRING_PTR(str) + offset;
3828  if (offset > RSTRING_LEN(str)) break;
3829  beg = rb_reg_search(pat, str, offset, 0);
3830  } while (beg >= 0);
3831  if (RSTRING_LEN(str) > offset) {
3832  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3833  }
3834  rb_reg_search(pat, str, last, 0);
3835  if (bang) {
3836  rb_str_shared_replace(str, dest);
3837  }
3838  else {
3839  RBASIC(dest)->klass = rb_obj_class(str);
3840  OBJ_INFECT(dest, str);
3841  str = dest;
3842  }
3843 
3844  if (tainted) OBJ_TAINT(str);
3845  return str;
3846 }
3847 
3848 
3849 /*
3850  * call-seq:
3851  * str.gsub!(pattern, replacement) -> str or nil
3852  * str.gsub!(pattern) {|match| block } -> str or nil
3853  * str.gsub!(pattern) -> an_enumerator
3854  *
3855  * Performs the substitutions of <code>String#gsub</code> in place, returning
3856  * <i>str</i>, or <code>nil</code> if no substitutions were performed.
3857  * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
3858  */
3859 
3860 static VALUE
3862 {
3863  str_modify_keep_cr(str);
3864  return str_gsub(argc, argv, str, 1);
3865 }
3866 
3867 
3868 /*
3869  * call-seq:
3870  * str.gsub(pattern, replacement) -> new_str
3871  * str.gsub(pattern, hash) -> new_str
3872  * str.gsub(pattern) {|match| block } -> new_str
3873  * str.gsub(pattern) -> enumerator
3874  *
3875  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
3876  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3877  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3878  * regular expression metacharacters it contains will be interpreted
3879  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3880  * instead of a digit.
3881  *
3882  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3883  * the matched text. It may contain back-references to the pattern's capture
3884  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3885  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3886  * double-quoted string, both back-references must be preceded by an
3887  * additional backslash. However, within <i>replacement</i> the special match
3888  * variables, such as <code>$&</code>, will not refer to the current match.
3889  *
3890  * If the second argument is a <code>Hash</code>, and the matched text is one
3891  * of its keys, the corresponding value is the replacement string.
3892  *
3893  * In the block form, the current match string is passed in as a parameter,
3894  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3895  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3896  * returned by the block will be substituted for the match on each call.
3897  *
3898  * The result inherits any tainting in the original string or any supplied
3899  * replacement string.
3900  *
3901  * When neither a block nor a second argument is supplied, an
3902  * <code>Enumerator</code> is returned.
3903  *
3904  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
3905  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
3906  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
3907  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
3908  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
3909  */
3910 
3911 static VALUE
3913 {
3914  return str_gsub(argc, argv, str, 0);
3915 }
3916 
3917 
3918 /*
3919  * call-seq:
3920  * str.replace(other_str) -> str
3921  *
3922  * Replaces the contents and taintedness of <i>str</i> with the corresponding
3923  * values in <i>other_str</i>.
3924  *
3925  * s = "hello" #=> "hello"
3926  * s.replace "world" #=> "world"
3927  */
3928 
3929 VALUE
3931 {
3932  str_modifiable(str);
3933  if (str == str2) return str;
3934 
3935  StringValue(str2);
3936  str_discard(str);
3937  return str_replace(str, str2);
3938 }
3939 
3940 /*
3941  * call-seq:
3942  * string.clear -> string
3943  *
3944  * Makes string empty.
3945  *
3946  * a = "abcde"
3947  * a.clear #=> ""
3948  */
3949 
3950 static VALUE
3952 {
3953  str_discard(str);
3954  STR_SET_EMBED(str);
3955  STR_SET_EMBED_LEN(str, 0);
3956  RSTRING_PTR(str)[0] = 0;
3957  if (rb_enc_asciicompat(STR_ENC_GET(str)))
3959  else
3961  return str;
3962 }
3963 
3964 /*
3965  * call-seq:
3966  * string.chr -> string
3967  *
3968  * Returns a one-character string at the beginning of the string.
3969  *
3970  * a = "abcde"
3971  * a.chr #=> "a"
3972  */
3973 
3974 static VALUE
3976 {
3977  return rb_str_substr(str, 0, 1);
3978 }
3979 
3980 /*
3981  * call-seq:
3982  * str.getbyte(index) -> 0 .. 255
3983  *
3984  * Returns the <i>index</i>th byte as an integer.
3985  */
3986 static VALUE
3988 {
3989  long pos = NUM2LONG(index);
3990 
3991  if (pos < 0)
3992  pos += RSTRING_LEN(str);
3993  if (pos < 0 || RSTRING_LEN(str) <= pos)
3994  return Qnil;
3995 
3996  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3997 }
3998 
3999 /*
4000  * call-seq:
4001  * str.setbyte(index, int) -> int
4002  *
4003  * Modifies the <i>index</i>th byte to be <i>int</i>.
4004  */
4005 static VALUE
4006 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4007 {
4008  long pos = NUM2LONG(index);
4009  int byte = NUM2INT(value);
4010  /* Both arguments are converted before rb_str_modify() is called on str. */
4011  rb_str_modify(str);
4012  /* Accepted indices are -len .. len-1; anything outside raises IndexError. */
4013  if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4014  rb_raise(rb_eIndexError, "index %ld out of string", pos);
4015  if (pos < 0)
4016  pos += RSTRING_LEN(str); /* negative index counts back from the end */
4017  /* byte is an int; the store narrows it to the buffer's element type. */
4018  RSTRING_PTR(str)[pos] = byte;
4019  /* The caller's original value object is returned, not the stored byte. */
4020  return value;
4021 }
4022 
4023 static VALUE
4024 str_byte_substr(VALUE str, long beg, long len)
4025 {
4026  char *p, *s = RSTRING_PTR(str);
4027  long n = RSTRING_LEN(str);
4028  VALUE str2;
4029 
4030  if (beg > n || len < 0) return Qnil;
4031  if (beg < 0) {
4032  beg += n;
4033  if (beg < 0) return Qnil;
4034  }
4035  if (beg + len > n)
4036  len = n - beg;
4037  if (len <= 0) {
4038  len = 0;
4039  p = 0;
4040  }
4041  else
4042  p = s + beg;
4043 
4044  if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4045  str2 = rb_str_new4(str);
4046  str2 = str_new3(rb_obj_class(str2), str2);
4047  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4048  RSTRING(str2)->as.heap.len = len;
4049  }
4050  else {
4051  str2 = rb_str_new5(str, p, len);
4052  }
4053 
4054  str_enc_copy(str2, str);
4055 
4056  if (RSTRING_LEN(str2) == 0) {
4057  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4059  else
4061  }
4062  else {
4063  switch (ENC_CODERANGE(str)) {
4064  case ENC_CODERANGE_7BIT:
4066  break;
4067  default:
4069  break;
4070  }
4071  }
4072 
4073  OBJ_INFECT(str2, str);
4074 
4075  return str2;
4076 }
4077 
4078 static VALUE
4080 {
4081  long idx;
4082  switch (TYPE(indx)) {
4083  case T_FIXNUM:
4084  idx = FIX2LONG(indx);
4085 
4086  num_index:
4087  str = str_byte_substr(str, idx, 1);
4088  if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4089  return str;
4090 
4091  default:
4092  /* check if indx is Range */
4093  {
4094  long beg, len = RSTRING_LEN(str);
4095 
4096  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4097  case Qfalse:
4098  break;
4099  case Qnil:
4100  return Qnil;
4101  default:
4102  return str_byte_substr(str, beg, len);
4103  }
4104  }
4105  idx = NUM2LONG(indx);
4106  goto num_index;
4107  }
4108  return Qnil; /* not reached */
4109 }
4110 
4111 /*
4112  * call-seq:
4113  * str.byteslice(fixnum) -> new_str or nil
4114  * str.byteslice(fixnum, fixnum) -> new_str or nil
4115  * str.byteslice(range) -> new_str or nil
4116  *
4117  * Byte Reference---If passed a single <code>Fixnum</code>, returns a
4118  * substring of one byte at that position. If passed two <code>Fixnum</code>
4119  * objects, returns a substring starting at the offset given by the first, and
4120  * a length given by the second. If given a <code>Range</code>, a substring containing
4121  * bytes at offsets given by the range is returned. In all three cases, if
4122  * an offset is negative, it is counted from the end of <i>str</i>. Returns
4123  * <code>nil</code> if the initial offset falls outside the string, the length
4124  * is negative, or the beginning of the range is greater than the end.
4125  * The encoding of the resulted string keeps original encoding.
4126  *
4127  * "hello".byteslice(1) #=> "e"
4128  * "hello".byteslice(-1) #=> "o"
4129  * "hello".byteslice(1, 2) #=> "el"
4130  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4131  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4132  */
4133 
4134 static VALUE
4136 {
4137  if (argc == 2) {
4138  return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4139  }
4140  if (argc != 1) {
4141  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
4142  }
4143  return str_byte_aref(str, argv[0]);
4144 }
4145 
4146 /*
4147  * call-seq:
4148  * str.reverse -> new_str
4149  *
4150  * Returns a new string with the characters from <i>str</i> in reverse order.
4151  *
4152  * "stressed".reverse #=> "desserts"
4153  */
4154 
4155 static VALUE
4157 {
4158  rb_encoding *enc;
4159  VALUE rev;
4160  char *s, *e, *p;
4161  int single = 1;
4162 
4163  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4164  enc = STR_ENC_GET(str);
4165  rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4166  s = RSTRING_PTR(str); e = RSTRING_END(str);
4167  p = RSTRING_END(rev);
4168 
4169  if (RSTRING_LEN(str) > 1) {
4170  if (single_byte_optimizable(str)) {
4171  while (s < e) {
4172  *--p = *s++;
4173  }
4174  }
4175  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4176  while (s < e) {
4177  int clen = rb_enc_fast_mbclen(s, e, enc);
4178 
4179  if (clen > 1 || (*s & 0x80)) single = 0;
4180  p -= clen;
4181  memcpy(p, s, clen);
4182  s += clen;
4183  }
4184  }
4185  else {
4186  while (s < e) {
4187  int clen = rb_enc_mbclen(s, e, enc);
4188 
4189  if (clen > 1 || (*s & 0x80)) single = 0;
4190  p -= clen;
4191  memcpy(p, s, clen);
4192  s += clen;
4193  }
4194  }
4195  }
4196  STR_SET_LEN(rev, RSTRING_LEN(str));
4197  OBJ_INFECT(rev, str);
4198  if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4199  if (single) {
4201  }
4202  else {
4204  }
4205  }
4207 
4208  return rev;
4209 }
4210 
4211 
4212 /*
4213  * call-seq:
4214  * str.reverse! -> str
4215  *
4216  * Reverses <i>str</i> in place.
4217  */
4218 
4219 static VALUE
4221 {
4222  if (RSTRING_LEN(str) > 1) {
4223  if (single_byte_optimizable(str)) {
4224  char *s, *e, c;
4225 
4226  str_modify_keep_cr(str);
4227  s = RSTRING_PTR(str);
4228  e = RSTRING_END(str) - 1;
4229  while (s < e) {
4230  c = *s;
4231  *s++ = *e;
4232  *e-- = c;
4233  }
4234  }
4235  else {
4237  }
4238  }
4239  else {
4240  str_modify_keep_cr(str);
4241  }
4242  return str;
4243 }
4244 
4245 
4246 /*
4247  * call-seq:
4248  * str.include? other_str -> true or false
4249  *
4250  * Returns <code>true</code> if <i>str</i> contains the given string or
4251  * character.
4252  *
4253  * "hello".include? "lo" #=> true
4254  * "hello".include? "ol" #=> false
4255  * "hello".include? ?h #=> true
4256  */
4257 
4258 static VALUE
4260 {
4261  long i;
4262 
4263  StringValue(arg);
4264  i = rb_str_index(str, arg, 0);
4265 
4266  if (i == -1) return Qfalse;
4267  return Qtrue;
4268 }
4269 
4270 
4271 /*
4272  * call-seq:
4273  * str.to_i(base=10) -> integer
4274  *
4275  * Returns the result of interpreting leading characters in <i>str</i> as an
4276  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4277  * end of a valid number are ignored. If there is not a valid number at the
4278  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
4279  * exception when <i>base</i> is valid.
4280  *
4281  * "12345".to_i #=> 12345
4282  * "99 red balloons".to_i #=> 99
4283  * "0a".to_i #=> 0
4284  * "0a".to_i(16) #=> 10
4285  * "hello".to_i #=> 0
4286  * "1100101".to_i(2) #=> 101
4287  * "1100101".to_i(8) #=> 294977
4288  * "1100101".to_i(10) #=> 1100101
4289  * "1100101".to_i(16) #=> 17826049
4290  */
4291 
4292 static VALUE
4294 {
4295  int base;
4296 
4297  if (argc == 0) base = 10;
4298  else {
4299  VALUE b;
4300 
4301  rb_scan_args(argc, argv, "01", &b);
4302  base = NUM2INT(b);
4303  }
4304  if (base < 0) {
4305  rb_raise(rb_eArgError, "invalid radix %d", base);
4306  }
4307  return rb_str_to_inum(str, base, FALSE);
4308 }
4309 
4310 
4311 /*
4312  * call-seq:
4313  * str.to_f -> float
4314  *
4315  * Returns the result of interpreting leading characters in <i>str</i> as a
4316  * floating point number. Extraneous characters past the end of a valid number
4317  * are ignored. If there is not a valid number at the start of <i>str</i>,
4318  * <code>0.0</code> is returned. This method never raises an exception.
4319  *
4320  * "123.45e1".to_f #=> 1234.5
4321  * "45.67 degrees".to_f #=> 45.67
4322  * "thx1138".to_f #=> 0.0
4323  */
4324 
4325 static VALUE
4327 {
4328  return DBL2NUM(rb_str_to_dbl(str, FALSE));
4329 }
4330 
4331 
4332 /*
4333  * call-seq:
4334  * str.to_s -> str
4335  * str.to_str -> str
4336  *
4337  * Returns the receiver.
4338  */
4339 
4340 static VALUE
4342 {
4343  if (rb_obj_class(str) != rb_cString) {
4344  return str_duplicate(rb_cString, str);
4345  }
4346  return str;
4347 }
4348 
4349 #if 0 /* disabled: the helper below is excluded from compilation */
4350 static void
4351 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
4352 {
4353  char s[RUBY_MAX_CHAR_LEN]; /* scratch buffer for one character */
4354  int n = rb_enc_codelen(c, enc);
4355  /* NOTE(review): presumably encodes c into s (n bytes) and then appends those bytes to str -- confirm against the encoding API. */
4356  rb_enc_mbcput(c, s, enc);
4357  rb_enc_str_buf_cat(str, s, n, enc);
4358 }
4359 #endif
4360 
4361 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4362 
4363 int
4364 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4365 {
4366  char buf[CHAR_ESC_LEN + 1];
4367  int l;
4368  /* Formats code point c as text in buf, appends it to result, and returns the number of bytes appended. */
4369 #if SIZEOF_INT > 4
4370  c &= 0xffffffff; /* keep only the low 32 bits when int is wider than that */
4371 #endif
4372  if (unicode_p) { /* Unicode style: literal char, \uXXXX, or \u{X...} */
4373  if (c < 0x7F && ISPRINT(c)) { /* printable ASCII is emitted as-is */
4374  snprintf(buf, CHAR_ESC_LEN, "%c", c);
4375  }
4376  else if (c < 0x10000) { /* below 0x10000: four hex digits, zero padded */
4377  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4378  }
4379  else { /* larger values use the braced form */
4380  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4381  }
4382  }
4383  else { /* non-Unicode style: \xXX for values below 0x100, otherwise \x{X...} */
4384  if (c < 0x100) {
4385  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4386  }
4387  else {
4388  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4389  }
4390  }
4391  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
4392  rb_str_buf_cat(result, buf, l);
4393  return l;
4394 }
4395 
4396 /*
4397  * call-seq:
4398  * str.inspect -> string
4399  *
4400  * Returns a printable version of _str_, surrounded by quote marks,
4401  * with special characters escaped.
4402  *
4403  * str = "hello"
4404  * str[3] = "\b"
4405  * str.inspect #=> "\"hel\\bo\""
4406  */
4407 
4408 VALUE
4410 {
4411  rb_encoding *enc = STR_ENC_GET(str);
4412  const char *p, *pend, *prev;
4413  char buf[CHAR_ESC_LEN + 1];
4416  int unicode_p = rb_enc_unicode_p(enc);
4417  int asciicompat = rb_enc_asciicompat(enc);
4418  static rb_encoding *utf16, *utf32;
4419 
4420  if (!utf16) utf16 = rb_enc_find("UTF-16");
4421  if (!utf32) utf32 = rb_enc_find("UTF-32");
4422  if (resenc == NULL) resenc = rb_default_external_encoding();
4423  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4424  rb_enc_associate(result, resenc);
4425  str_buf_cat2(result, "\"");
4426 
4427  p = RSTRING_PTR(str); pend = RSTRING_END(str);
4428  prev = p;
4429  if (enc == utf16) {
4430  const unsigned char *q = (const unsigned char *)p;
4431  if (q[0] == 0xFE && q[1] == 0xFF)
4432  enc = rb_enc_find("UTF-16BE");
4433  else if (q[0] == 0xFF && q[1] == 0xFE)
4434  enc = rb_enc_find("UTF-16LE");
4435  else
4436  unicode_p = 0;
4437  }
4438  else if (enc == utf32) {
4439  const unsigned char *q = (const unsigned char *)p;
4440  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4441  enc = rb_enc_find("UTF-32BE");
4442  else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4443  enc = rb_enc_find("UTF-32LE");
4444  else
4445  unicode_p = 0;
4446  }
4447  while (p < pend) {
4448  unsigned int c, cc;
4449  int n;
4450 
4451  n = rb_enc_precise_mbclen(p, pend, enc);
4452  if (!MBCLEN_CHARFOUND_P(n)) {
4453  if (p > prev) str_buf_cat(result, prev, p - prev);
4454  n = rb_enc_mbminlen(enc);
4455  if (pend < p + n)
4456  n = (int)(pend - p);
4457  while (n--) {
4458  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4459  str_buf_cat(result, buf, strlen(buf));
4460  prev = ++p;
4461  }
4462  continue;
4463  }
4464  n = MBCLEN_CHARFOUND_LEN(n);
4465  c = rb_enc_mbc_to_codepoint(p, pend, enc);
4466  p += n;
4467  if ((asciicompat || unicode_p) &&
4468  (c == '"'|| c == '\\' ||
4469  (c == '#' &&
4470  p < pend &&
4472  (cc = rb_enc_codepoint(p,pend,enc),
4473  (cc == '$' || cc == '@' || cc == '{'))))) {
4474  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4475  str_buf_cat2(result, "\\");
4476  if (asciicompat || enc == resenc) {
4477  prev = p - n;
4478  continue;
4479  }
4480  }
4481  switch (c) {
4482  case '\n': cc = 'n'; break;
4483  case '\r': cc = 'r'; break;
4484  case '\t': cc = 't'; break;
4485  case '\f': cc = 'f'; break;
4486  case '\013': cc = 'v'; break;
4487  case '\010': cc = 'b'; break;
4488  case '\007': cc = 'a'; break;
4489  case 033: cc = 'e'; break;
4490  default: cc = 0; break;
4491  }
4492  if (cc) {
4493  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4494  buf[0] = '\\';
4495  buf[1] = (char)cc;
4496  str_buf_cat(result, buf, 2);
4497  prev = p;
4498  continue;
4499  }
4500  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4501  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4502  continue;
4503  }
4504  else {
4505  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4506  rb_str_buf_cat_escaped_char(result, c, unicode_p);
4507  prev = p;
4508  continue;
4509  }
4510  }
4511  if (p > prev) str_buf_cat(result, prev, p - prev);
4512  str_buf_cat2(result, "\"");
4513 
4514  OBJ_INFECT(result, str);
4515  return result;
4516 }
4517 
4518 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
4519 
4520 /*
4521  * call-seq:
4522  * str.dump -> new_str
4523  *
4524  * Produces a version of <i>str</i> with all nonprinting characters replaced by
4525  * <code>\nnn</code> notation and all special characters escaped.
4526  */
4527 
4528 VALUE
4530 {
4531  rb_encoding *enc = rb_enc_get(str);
4532  long len;
4533  const char *p, *pend;
4534  char *q, *qend;
4535  VALUE result;
4536  int u8 = (enc == rb_utf8_encoding());
4537 
4538  len = 2; /* "" */
4539  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4540  while (p < pend) {
4541  unsigned char c = *p++;
4542  switch (c) {
4543  case '"': case '\\':
4544  case '\n': case '\r':
4545  case '\t': case '\f':
4546  case '\013': case '\010': case '\007': case '\033':
4547  len += 2;
4548  break;
4549 
4550  case '#':
4551  len += IS_EVSTR(p, pend) ? 2 : 1;
4552  break;
4553 
4554  default:
4555  if (ISPRINT(c)) {
4556  len++;
4557  }
4558  else {
4559  if (u8) { /* \u{NN} */
4560  int n = rb_enc_precise_mbclen(p-1, pend, enc);
4561  if (MBCLEN_CHARFOUND_P(n-1)) {
4562  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4563  while (cc >>= 4) len++;
4564  len += 5;
4565  p += MBCLEN_CHARFOUND_LEN(n)-1;
4566  break;
4567  }
4568  }
4569  len += 4; /* \xNN */
4570  }
4571  break;
4572  }
4573  }
4574  if (!rb_enc_asciicompat(enc)) {
4575  len += 19; /* ".force_encoding('')" */
4576  len += strlen(enc->name);
4577  }
4578 
4579  result = rb_str_new5(str, 0, len);
4580  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4581  q = RSTRING_PTR(result); qend = q + len + 1;
4582 
4583  *q++ = '"';
4584  while (p < pend) {
4585  unsigned char c = *p++;
4586 
4587  if (c == '"' || c == '\\') {
4588  *q++ = '\\';
4589  *q++ = c;
4590  }
4591  else if (c == '#') {
4592  if (IS_EVSTR(p, pend)) *q++ = '\\';
4593  *q++ = '#';
4594  }
4595  else if (c == '\n') {
4596  *q++ = '\\';
4597  *q++ = 'n';
4598  }
4599  else if (c == '\r') {
4600  *q++ = '\\';
4601  *q++ = 'r';
4602  }
4603  else if (c == '\t') {
4604  *q++ = '\\';
4605  *q++ = 't';
4606  }
4607  else if (c == '\f') {
4608  *q++ = '\\';
4609  *q++ = 'f';
4610  }
4611  else if (c == '\013') {
4612  *q++ = '\\';
4613  *q++ = 'v';
4614  }
4615  else if (c == '\010') {
4616  *q++ = '\\';
4617  *q++ = 'b';
4618  }
4619  else if (c == '\007') {
4620  *q++ = '\\';
4621  *q++ = 'a';
4622  }
4623  else if (c == '\033') {
4624  *q++ = '\\';
4625  *q++ = 'e';
4626  }
4627  else if (ISPRINT(c)) {
4628  *q++ = c;
4629  }
4630  else {
4631  *q++ = '\\';
4632  if (u8) {
4633  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4634  if (MBCLEN_CHARFOUND_P(n)) {
4635  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4636  p += n;
4637  snprintf(q, qend-q, "u{%x}", cc);
4638  q += strlen(q);
4639  continue;
4640  }
4641  }
4642  snprintf(q, qend-q, "x%02X", c);
4643  q += 3;
4644  }
4645  }
4646  *q++ = '"';
4647  *q = '\0';
4648  if (!rb_enc_asciicompat(enc)) {
4649  snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4650  enc = rb_ascii8bit_encoding();
4651  }
4652  OBJ_INFECT(result, str);
4653  /* result from dump is ASCII */
4654  rb_enc_associate(result, enc);
4656  return result;
4657 }
4658 
4659 
4660 static void
4662 {
4663  if (rb_enc_dummy_p(enc)) {
4664  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4665  rb_enc_name(enc));
4666  }
4667 }
4668 
4669 /*
4670  * call-seq:
4671  * str.upcase! -> str or nil
4672  *
4673  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4674  * were made.
4675  * Note: case replacement is effective only in ASCII region.
4676  */
4677 
4678 static VALUE
4680 {
4681  rb_encoding *enc;
4682  char *s, *send;
4683  int modify = 0;
4684  int n;
4685 
4686  str_modify_keep_cr(str);
4687  enc = STR_ENC_GET(str);
4689  s = RSTRING_PTR(str); send = RSTRING_END(str);
4690  if (single_byte_optimizable(str)) {
4691  while (s < send) {
4692  unsigned int c = *(unsigned char*)s;
4693 
4694  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4695  *s = 'A' + (c - 'a');
4696  modify = 1;
4697  }
4698  s++;
4699  }
4700  }
4701  else {
4702  int ascompat = rb_enc_asciicompat(enc);
4703 
4704  while (s < send) {
4705  unsigned int c;
4706 
4707  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4708  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4709  *s = 'A' + (c - 'a');
4710  modify = 1;
4711  }
4712  s++;
4713  }
4714  else {
4715  c = rb_enc_codepoint_len(s, send, &n, enc);
4716  if (rb_enc_islower(c, enc)) {
4717  /* assuming toupper returns codepoint with same size */
4718  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4719  modify = 1;
4720  }
4721  s += n;
4722  }
4723  }
4724  }
4725 
4726  if (modify) return str;
4727  return Qnil;
4728 }
4729 
4730 
4731 /*
4732  * call-seq:
4733  * str.upcase -> new_str
4734  *
4735  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
4736  * uppercase counterparts. The operation is locale insensitive---only
4737  * characters ``a'' to ``z'' are affected.
4738  * Note: case replacement is effective only in ASCII region.
4739  *
4740  * "hEllO".upcase #=> "HELLO"
4741  */
4742 
4743 static VALUE
4745 {
4746  str = rb_str_dup(str);
4747  rb_str_upcase_bang(str);
4748  return str;
4749 }
4750 
4751 
4752 /*
4753  * call-seq:
4754  * str.downcase! -> str or nil
4755  *
4756  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4757  * changes were made.
4758  * Note: case replacement is effective only in ASCII region.
4759  */
4760 
4761 static VALUE
4763 {
4764  rb_encoding *enc;
4765  char *s, *send;
4766  int modify = 0;
4767 
4768  str_modify_keep_cr(str);
4769  enc = STR_ENC_GET(str);
4771  s = RSTRING_PTR(str); send = RSTRING_END(str);
4772  if (single_byte_optimizable(str)) {
4773  while (s < send) {
4774  unsigned int c = *(unsigned char*)s;
4775 
4776  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4777  *s = 'a' + (c - 'A');
4778  modify = 1;
4779  }
4780  s++;
4781  }
4782  }
4783  else {
4784  int ascompat = rb_enc_asciicompat(enc);
4785 
4786  while (s < send) {
4787  unsigned int c;
4788  int n;
4789 
4790  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4791  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4792  *s = 'a' + (c - 'A');
4793  modify = 1;
4794  }
4795  s++;
4796  }
4797  else {
4798  c = rb_enc_codepoint_len(s, send, &n, enc);
4799  if (rb_enc_isupper(c, enc)) {
4800  /* assuming tolower returns codepoint with same size */
4801  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4802  modify = 1;
4803  }
4804  s += n;
4805  }
4806  }
4807  }
4808 
4809  if (modify) return str;
4810  return Qnil;
4811 }
4812 
4813 
4814 /*
4815  * call-seq:
4816  * str.downcase -> new_str
4817  *
4818  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
4819  * lowercase counterparts. The operation is locale insensitive---only
4820  * characters ``A'' to ``Z'' are affected.
4821  * Note: case replacement is effective only in ASCII region.
4822  *
4823  * "hEllO".downcase #=> "hello"
4824  */
4825 
4826 static VALUE
4828 {
4829  str = rb_str_dup(str);
4830  rb_str_downcase_bang(str);
4831  return str;
4832 }
4833 
4834 
4835 /*
4836  * call-seq:
4837  * str.capitalize! -> str or nil
4838  *
4839  * Modifies <i>str</i> by converting the first character to uppercase and the
4840  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
4841  * Note: case conversion is effective only in ASCII region.
4842  *
4843  * a = "hello"
4844  * a.capitalize! #=> "Hello"
4845  * a #=> "Hello"
4846  * a.capitalize! #=> nil
4847  */
4848 
4849 static VALUE
4851 {
4852  rb_encoding *enc;
4853  char *s, *send;
4854  int modify = 0;
4855  unsigned int c;
4856  int n;
4857 
4858  str_modify_keep_cr(str);
4859  enc = STR_ENC_GET(str);
4861  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4862  s = RSTRING_PTR(str); send = RSTRING_END(str);
4863 
4864  c = rb_enc_codepoint_len(s, send, &n, enc);
4865  if (rb_enc_islower(c, enc)) {
4866  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4867  modify = 1;
4868  }
4869  s += n;
4870  while (s < send) {
4871  c = rb_enc_codepoint_len(s, send, &n, enc);
4872  if (rb_enc_isupper(c, enc)) {
4873  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4874  modify = 1;
4875  }
4876  s += n;
4877  }
4878 
4879  if (modify) return str;
4880  return Qnil;
4881 }
4882 
4883 
4884 /*
4885  * call-seq:
4886  * str.capitalize -> new_str
4887  *
4888  * Returns a copy of <i>str</i> with the first character converted to uppercase
4889  * and the remainder to lowercase.
4890  * Note: case conversion is effective only in ASCII region.
4891  *
4892  * "hello".capitalize #=> "Hello"
4893  * "HELLO".capitalize #=> "Hello"
4894  * "123ABC".capitalize #=> "123abc"
4895  */
4896 
4897 static VALUE
4899 {
4900  str = rb_str_dup(str);
4902  return str;
4903 }
4904 
4905 
4906 /*
4907  * call-seq:
4908  * str.swapcase! -> str or nil
4909  *
4910  * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4911  * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4912  * Note: case conversion is effective only in ASCII region.
4913  */
4914 
4915 static VALUE
4917 {
4918  rb_encoding *enc;
4919  char *s, *send;
4920  int modify = 0;
4921  int n;
4922 
4923  str_modify_keep_cr(str);
4924  enc = STR_ENC_GET(str);
4926  s = RSTRING_PTR(str); send = RSTRING_END(str);
4927  while (s < send) {
4928  unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
4929 
4930  if (rb_enc_isupper(c, enc)) {
4931  /* assuming tolower returns codepoint with same size */
4932  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4933  modify = 1;
4934  }
4935  else if (rb_enc_islower(c, enc)) {
4936  /* assuming toupper returns codepoint with same size */
4937  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4938  modify = 1;
4939  }
4940  s += n;
4941  }
4942 
4943  if (modify) return str;
4944  return Qnil;
4945 }
4946 
4947 
4948 /*
4949  * call-seq:
4950  * str.swapcase -> new_str
4951  *
4952  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4953  * to lowercase and lowercase characters converted to uppercase.
4954  * Note: case conversion is effective only in ASCII region.
4955  *
4956  * "Hello".swapcase #=> "hELLO"
4957  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
4958  */
4959 
4960 static VALUE
4962 {
4963  str = rb_str_dup(str);
4964  rb_str_swapcase_bang(str);
4965  return str;
4966 }
4967 
4968 typedef unsigned char *USTR;
4969 
4970 struct tr { /* cursor state advanced by trnext() */
4971  int gen; /* nonzero while trnext() is stepping through an "a-z" style range */
4972  unsigned int now, max; /* code point most recently produced, and the upper bound of the current range */
4973  char *p, *pend; /* next unread byte of the spec, and one past its last byte */
4974 };
4975 
4976 static unsigned int
4977 trnext(struct tr *t, rb_encoding *enc)
4978 {
4979  int n;
4980 
4981  for (;;) {
4982  if (!t->gen) {
4983  if (t->p == t->pend) return -1;
4984  if (t->p < t->pend - 1 && *t->p == '\\') {
4985  t->p++;
4986  }
4987  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4988  t->p += n;
4989  if (t->p < t->pend - 1 && *t->p == '-') {
4990  t->p++;
4991  if (t->p < t->pend) {
4992  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4993  t->p += n;
4994  if (t->now > c) {
4995  if (t->now < 0x80 && c < 0x80) {
4997  "invalid range \"%c-%c\" in string transliteration",
4998  t->now, c);
4999  }
5000  else {
5001  rb_raise(rb_eArgError, "invalid range in string transliteration");
5002  }
5003  continue; /* not reached */
5004  }
5005  t->gen = 1;
5006  t->max = c;
5007  }
5008  }
5009  return t->now;
5010  }
5011  else if (++t->now < t->max) {
5012  return t->now;
5013  }
5014  else {
5015  t->gen = 0;
5016  return t->max;
5017  }
5018  }
5019 }
5020 
5021 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5022 
5023 static VALUE
5024 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5025 {
5026  const unsigned int errc = -1;
5027  unsigned int trans[256];
5028  rb_encoding *enc, *e1, *e2;
5029  struct tr trsrc, trrepl;
5030  int cflag = 0;
5031  unsigned int c, c0, last = 0;
5032  int modify = 0, i, l;
5033  char *s, *send;
5034  VALUE hash = 0;
5035  int singlebyte = single_byte_optimizable(str);
5036  int cr;
5037 
5038 #define CHECK_IF_ASCII(c) \
5039  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5040  (cr = ENC_CODERANGE_VALID) : 0)
5041 
5042  StringValue(src);
5043  StringValue(repl);
5044  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5045  if (RSTRING_LEN(repl) == 0) {
5046  return rb_str_delete_bang(1, &src, str);
5047  }
5048 
5049  cr = ENC_CODERANGE(str);
5050  e1 = rb_enc_check(str, src);
5051  e2 = rb_enc_check(str, repl);
5052  if (e1 == e2) {
5053  enc = e1;
5054  }
5055  else {
5056  enc = rb_enc_check(src, repl);
5057  }
5058  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5059  if (RSTRING_LEN(src) > 1 &&
5060  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5061  trsrc.p + l < trsrc.pend) {
5062  cflag = 1;
5063  trsrc.p += l;
5064  }
5065  trrepl.p = RSTRING_PTR(repl);
5066  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5067  trsrc.gen = trrepl.gen = 0;
5068  trsrc.now = trrepl.now = 0;
5069  trsrc.max = trrepl.max = 0;
5070 
5071  if (cflag) {
5072  for (i=0; i<256; i++) {
5073  trans[i] = 1;
5074  }
5075  while ((c = trnext(&trsrc, enc)) != errc) {
5076  if (c < 256) {
5077  trans[c] = errc;
5078  }
5079  else {
5080  if (!hash) hash = rb_hash_new();
5081  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5082  }
5083  }
5084  while ((c = trnext(&trrepl, enc)) != errc)
5085  /* retrieve last replacer */;
5086  last = trrepl.now;
5087  for (i=0; i<256; i++) {
5088  if (trans[i] != errc) {
5089  trans[i] = last;
5090  }
5091  }
5092  }
5093  else {
5094  unsigned int r;
5095 
5096  for (i=0; i<256; i++) {
5097  trans[i] = errc;
5098  }
5099  while ((c = trnext(&trsrc, enc)) != errc) {
5100  r = trnext(&trrepl, enc);
5101  if (r == errc) r = trrepl.now;
5102  if (c < 256) {
5103  trans[c] = r;
5104  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5105  }
5106  else {
5107  if (!hash) hash = rb_hash_new();
5108  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5109  }
5110  }
5111  }
5112 
5113  if (cr == ENC_CODERANGE_VALID)
5114  cr = ENC_CODERANGE_7BIT;
5115  str_modify_keep_cr(str);
5116  s = RSTRING_PTR(str); send = RSTRING_END(str);
5117  if (sflag) {
5118  int clen, tlen;
5119  long offset, max = RSTRING_LEN(str);
5120  unsigned int save = -1;
5121  char *buf = ALLOC_N(char, max), *t = buf;
5122 
5123  while (s < send) {
5124  int may_modify = 0;
5125 
5126  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5127  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5128 
5129  s += clen;
5130  if (c < 256) {
5131  c = trans[c];
5132  }
5133  else if (hash) {
5134  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5135  if (NIL_P(tmp)) {
5136  if (cflag) c = last;
5137  else c = errc;
5138  }
5139  else if (cflag) c = errc;
5140  else c = NUM2INT(tmp);
5141  }
5142  else {
5143  c = errc;
5144  }
5145  if (c != (unsigned int)-1) {
5146  if (save == c) {
5147  CHECK_IF_ASCII(c);
5148  continue;
5149  }
5150  save = c;
5151  tlen = rb_enc_codelen(c, enc);
5152  modify = 1;
5153  }
5154  else {
5155  save = -1;
5156  c = c0;
5157  if (enc != e1) may_modify = 1;
5158  }
5159  while (t - buf + tlen >= max) {
5160  offset = t - buf;
5161  max *= 2;
5162  REALLOC_N(buf, char, max);
5163  t = buf + offset;
5164  }
5165  rb_enc_mbcput(c, t, enc);
5166  if (may_modify && memcmp(s, t, tlen) != 0) {
5167  modify = 1;
5168  }
5169  CHECK_IF_ASCII(c);
5170  t += tlen;
5171  }
5172  if (!STR_EMBED_P(str)) {
5173  xfree(RSTRING(str)->as.heap.ptr);
5174  }
5175  *t = '\0';
5176  RSTRING(str)->as.heap.ptr = buf;
5177  RSTRING(str)->as.heap.len = t - buf;
5178  STR_SET_NOEMBED(str);
5179  RSTRING(str)->as.heap.aux.capa = max;
5180  }
5181  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5182  while (s < send) {
5183  c = (unsigned char)*s;
5184  if (trans[c] != errc) {
5185  if (!cflag) {
5186  c = trans[c];
5187  *s = c;
5188  modify = 1;
5189  }
5190  else {
5191  *s = last;
5192  modify = 1;
5193  }
5194  }
5195  CHECK_IF_ASCII(c);
5196  s++;
5197  }
5198  }
5199  else {
5200  int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5201  long offset;
5202  char *buf = ALLOC_N(char, max), *t = buf;
5203 
5204  while (s < send) {
5205  int may_modify = 0;
5206  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5207  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5208 
5209  if (c < 256) {
5210  c = trans[c];
5211  }
5212  else if (hash) {
5213  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5214  if (NIL_P(tmp)) {
5215  if (cflag) c = last;
5216  else c = errc;
5217  }
5218  else if (cflag) c = errc;
5219  else c = NUM2INT(tmp);
5220  }
5221  else {
5222  c = cflag ? last : errc;
5223  }
5224  if (c != errc) {
5225  tlen = rb_enc_codelen(c, enc);
5226  modify = 1;
5227  }
5228  else {
5229  c = c0;
5230  if (enc != e1) may_modify = 1;
5231  }
5232  while (t - buf + tlen >= max) {
5233  offset = t - buf;
5234  max *= 2;
5235  REALLOC_N(buf, char, max);
5236  t = buf + offset;
5237  }
5238  if (s != t) {
5239  rb_enc_mbcput(c, t, enc);
5240  if (may_modify && memcmp(s, t, tlen) != 0) {
5241  modify = 1;
5242  }
5243  }
5244  CHECK_IF_ASCII(c);
5245  s += clen;
5246  t += tlen;
5247  }
5248  if (!STR_EMBED_P(str)) {
5249  xfree(RSTRING(str)->as.heap.ptr);
5250  }
5251  *t = '\0';
5252  RSTRING(str)->as.heap.ptr = buf;
5253  RSTRING(str)->as.heap.len = t - buf;
5254  STR_SET_NOEMBED(str);
5255  RSTRING(str)->as.heap.aux.capa = max;
5256  }
5257 
5258  if (modify) {
5259  if (cr != ENC_CODERANGE_BROKEN)
5260  ENC_CODERANGE_SET(str, cr);
5261  rb_enc_associate(str, enc);
5262  return str;
5263  }
5264  return Qnil;
5265 }
5266 
5267 
5268 /*
5269  * call-seq:
5270  * str.tr!(from_str, to_str) -> str or nil
5271  *
5272  * Translates <i>str</i> in place, using the same rules as
5273  * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5274  * changes were made.
5275  */
5276 
5277 static VALUE
5279 {
5280  return tr_trans(str, src, repl, 0);
5281 }
5282 
5283 
5284 /*
5285  * call-seq:
5286  * str.tr(from_str, to_str) -> new_str
5287  *
5288  * Returns a copy of <i>str</i> with the characters in <i>from_str</i>
5289  * replaced by the corresponding characters in <i>to_str</i>. If
5290  * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
5291  * character in order to maintain the correspondence.
5292  *
5293  * "hello".tr('el', 'ip') #=> "hippo"
5294  * "hello".tr('aeiou', '*') #=> "h*ll*"
5295  *
5296  * Both strings may use the c1-c2 notation to denote ranges of characters,
5297  * and <i>from_str</i> may start with a <code>^</code>, which denotes all
5298  * characters except those listed.
5299  *
5300  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
5301  * "hello".tr('^aeiou', '*') #=> "*e**o"
5302  */
5303 
5304 static VALUE
5305 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5306 {
5307  str = rb_str_dup(str);
5308  tr_trans(str, src, repl, 0);
5309  return str;
5310 }
5311 
5312 #define TR_TABLE_SIZE 257
5313 static void
5314 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5315  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5316 {
5317  const unsigned int errc = -1;
5318  char buf[256];
5319  struct tr tr;
5320  unsigned int c;
5321  VALUE table = 0, ptable = 0;
5322  int i, l, cflag = 0;
5323 
5324  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5325  tr.gen = tr.now = tr.max = 0;
5326 
5327  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5328  cflag = 1;
5329  tr.p += l;
5330  }
5331  if (first) {
5332  for (i=0; i<256; i++) {
5333  stable[i] = 1;
5334  }
5335  stable[256] = cflag;
5336  }
5337  else if (stable[256] && !cflag) {
5338  stable[256] = 0;
5339  }
5340  for (i=0; i<256; i++) {
5341  buf[i] = cflag;
5342  }
5343 
5344  while ((c = trnext(&tr, enc)) != errc) {
5345  if (c < 256) {
5346  buf[c & 0xff] = !cflag;
5347  }
5348  else {
5349  VALUE key = UINT2NUM(c);
5350 
5351  if (!table) {
5352  table = rb_hash_new();
5353  if (cflag) {
5354  ptable = *ctablep;
5355  *ctablep = table;
5356  }
5357  else {
5358  ptable = *tablep;
5359  *tablep = table;
5360  }
5361  }
5362  if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
5363  rb_hash_aset(table, key, Qtrue);
5364  }
5365  }
5366  }
5367  for (i=0; i<256; i++) {
5368  stable[i] = stable[i] && buf[i];
5369  }
5370 }
5371 
5372 
5373 static int
5374 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5375 {
5376  if (c < 256) {
5377  return table[c] != 0;
5378  }
5379  else {
5380  VALUE v = UINT2NUM(c);
5381 
5382  if (del) {
5383  if (!NIL_P(rb_hash_lookup(del, v)) &&
5384  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5385  return TRUE;
5386  }
5387  }
5388  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5389  return FALSE;
5390  }
5391  return table[256] ? TRUE : FALSE;
5392  }
5393 }
5394 
5395 /*
5396  * call-seq:
5397  * str.delete!([other_str]+) -> str or nil
5398  *
5399  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5400  * <code>nil</code> if <i>str</i> was not modified.
5401  */
5402 
5403 static VALUE
5405 {
5406  char squeez[TR_TABLE_SIZE];
5407  rb_encoding *enc = 0;
5408  char *s, *send, *t;
5409  VALUE del = 0, nodel = 0;
5410  int modify = 0;
5411  int i, ascompat, cr;
5412 
5413  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5414  if (argc < 1) {
5415  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5416  }
5417  for (i=0; i<argc; i++) {
5418  VALUE s = argv[i];
5419 
5420  StringValue(s);
5421  enc = rb_enc_check(str, s);
5422  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5423  }
5424 
5425  str_modify_keep_cr(str);
5426  ascompat = rb_enc_asciicompat(enc);
5427  s = t = RSTRING_PTR(str);
5428  send = RSTRING_END(str);
5429  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5430  while (s < send) {
5431  unsigned int c;
5432  int clen;
5433 
5434  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5435  if (squeez[c]) {
5436  modify = 1;
5437  }
5438  else {
5439  if (t != s) *t = c;
5440  t++;
5441  }
5442  s++;
5443  }
5444  else {
5445  c = rb_enc_codepoint_len(s, send, &clen, enc);
5446 
5447  if (tr_find(c, squeez, del, nodel)) {
5448  modify = 1;
5449  }
5450  else {
5451  if (t != s) rb_enc_mbcput(c, t, enc);
5452  t += clen;
5453  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5454  }
5455  s += clen;
5456  }
5457  }
5458  *t = '\0';
5459  STR_SET_LEN(str, t - RSTRING_PTR(str));
5460  ENC_CODERANGE_SET(str, cr);
5461 
5462  if (modify) return str;
5463  return Qnil;
5464 }
5465 
5466 
5467 /*
5468  * call-seq:
5469  * str.delete([other_str]+) -> new_str
5470  *
5471  * Returns a copy of <i>str</i> with all characters in the intersection of its
5472  * arguments deleted. Uses the same rules for building the set of characters as
5473  * <code>String#count</code>.
5474  *
5475  * "hello".delete "l","lo" #=> "heo"
5476  * "hello".delete "lo" #=> "he"
5477  * "hello".delete "aeiou", "^e" #=> "hell"
5478  * "hello".delete "ej-m" #=> "ho"
5479  */
5480 
5481 static VALUE
5483 {
5484  str = rb_str_dup(str);
5485  rb_str_delete_bang(argc, argv, str);
5486  return str;
5487 }
5488 
5489 
5490 /*
5491  * call-seq:
5492  * str.squeeze!([other_str]*) -> str or nil
5493  *
5494  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
5495  * <code>nil</code> if no changes were made.
5496  */
5497 
5498 static VALUE
5500 {
5501  char squeez[TR_TABLE_SIZE];
5502  rb_encoding *enc = 0;
5503  VALUE del = 0, nodel = 0;
5504  char *s, *send, *t;
5505  int i, modify = 0;
5506  int ascompat, singlebyte = single_byte_optimizable(str);
5507  unsigned int save;
5508 
5509  if (argc == 0) {
5510  enc = STR_ENC_GET(str);
5511  }
5512  else {
5513  for (i=0; i<argc; i++) {
5514  VALUE s = argv[i];
5515 
5516  StringValue(s);
5517  enc = rb_enc_check(str, s);
5518  if (singlebyte && !single_byte_optimizable(s))
5519  singlebyte = 0;
5520  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5521  }
5522  }
5523 
5524  str_modify_keep_cr(str);
5525  s = t = RSTRING_PTR(str);
5526  if (!s || RSTRING_LEN(str) == 0) return Qnil;
5527  send = RSTRING_END(str);
5528  save = -1;
5529  ascompat = rb_enc_asciicompat(enc);
5530 
5531  if (singlebyte) {
5532  while (s < send) {
5533  unsigned int c = *(unsigned char*)s++;
5534  if (c != save || (argc > 0 && !squeez[c])) {
5535  *t++ = save = c;
5536  }
5537  }
5538  } else {
5539  while (s < send) {
5540  unsigned int c;
5541  int clen;
5542 
5543  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5544  if (c != save || (argc > 0 && !squeez[c])) {
5545  *t++ = save = c;
5546  }
5547  s++;
5548  }
5549  else {
5550  c = rb_enc_codepoint_len(s, send, &clen, enc);
5551 
5552  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5553  if (t != s) rb_enc_mbcput(c, t, enc);
5554  save = c;
5555  t += clen;
5556  }
5557  s += clen;
5558  }
5559  }
5560  }
5561 
5562  *t = '\0';
5563  if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5564  STR_SET_LEN(str, t - RSTRING_PTR(str));
5565  modify = 1;
5566  }
5567 
5568  if (modify) return str;
5569  return Qnil;
5570 }
5571 
5572 
5573 /*
5574  * call-seq:
5575  * str.squeeze([other_str]*) -> new_str
5576  *
5577  * Builds a set of characters from the <i>other_str</i> parameter(s) using the
5578  * procedure described for <code>String#count</code>. Returns a new string
5579  * where runs of the same character that occur in this set are replaced by a
5580  * single character. If no arguments are given, all runs of identical
5581  * characters are replaced by a single character.
5582  *
5583  * "yellow moon".squeeze #=> "yelow mon"
5584  * " now is the".squeeze(" ") #=> " now is the"
5585  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
5586  */
5587 
5588 static VALUE
5590 {
5591  str = rb_str_dup(str);
5592  rb_str_squeeze_bang(argc, argv, str);
5593  return str;
5594 }
5595 
5596 
5597 /*
5598  * call-seq:
5599  * str.tr_s!(from_str, to_str) -> str or nil
5600  *
5601  * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5602  * returning <i>str</i>, or <code>nil</code> if no changes were made.
5603  */
5604 
5605 static VALUE
5607 {
5608  return tr_trans(str, src, repl, 1);
5609 }
5610 
5611 
5612 /*
5613  * call-seq:
5614  * str.tr_s(from_str, to_str) -> new_str
5615  *
5616  * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5617  * then removes duplicate characters in regions that were affected by the
5618  * translation.
5619  *
5620  * "hello".tr_s('l', 'r') #=> "hero"
5621  * "hello".tr_s('el', '*') #=> "h*o"
5622  * "hello".tr_s('el', 'hx') #=> "hhxo"
5623  */
5624 
5625 static VALUE
5626 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5627 {
5628  str = rb_str_dup(str);
5629  tr_trans(str, src, repl, 1);
5630  return str;
5631 }
5632 
5633 
5634 /*
5635  * call-seq:
5636  * str.count([other_str]+) -> fixnum
5637  *
5638  * Each <i>other_str</i> parameter defines a set of characters to count. The
5639  * intersection of these sets defines the characters to count in
5640  * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
5641  * negated. The sequence c1--c2 means all characters between c1 and c2.
5642  *
5643  * a = "hello world"
5644  * a.count "lo" #=> 5
5645  * a.count "lo", "o" #=> 2
5646  * a.count "hello", "^l" #=> 4
5647  * a.count "ej-m" #=> 4
5648  */
5649 
5650 static VALUE
5652 {
5653  char table[TR_TABLE_SIZE];
5654  rb_encoding *enc = 0;
5655  VALUE del = 0, nodel = 0;
5656  char *s, *send;
5657  int i;
5658  int ascompat;
5659 
5660  if (argc < 1) {
5661  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5662  }
5663  for (i=0; i<argc; i++) {
5664  VALUE tstr = argv[i];
5665  unsigned char c;
5666 
5667  StringValue(tstr);
5668  enc = rb_enc_check(str, tstr);
5669  if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5670  (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5671  int n = 0;
5672 
5673  s = RSTRING_PTR(str);
5674  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5675  send = RSTRING_END(str);
5676  while (s < send) {
5677  if (*(unsigned char*)s++ == c) n++;
5678  }
5679  return INT2NUM(n);
5680  }
5681  tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5682  }
5683 
5684  s = RSTRING_PTR(str);
5685  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5686  send = RSTRING_END(str);
5687  ascompat = rb_enc_asciicompat(enc);
5688  i = 0;
5689  while (s < send) {
5690  unsigned int c;
5691 
5692  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5693  if (table[c]) {
5694  i++;
5695  }
5696  s++;
5697  }
5698  else {
5699  int clen;
5700  c = rb_enc_codepoint_len(s, send, &clen, enc);
5701  if (tr_find(c, table, del, nodel)) {
5702  i++;
5703  }
5704  s += clen;
5705  }
5706  }
5707 
5708  return INT2NUM(i);
5709 }
5710 
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * bytes (0x09-0x0d and 0x20), 0 for everything else.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,    /* \t \n \v \f \r */
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* space */
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xa0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xb0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xc0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xd0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xe0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xf0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};

/* Table-driven whitespace test; the cast keeps the index in 0..255. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5731 
5732 /*
5733  * call-seq:
5734  * str.split(pattern=$;, [limit]) -> anArray
5735  *
5736  * Divides <i>str</i> into substrings based on a delimiter, returning an array
5737  * of these substrings.
5738  *
5739  * If <i>pattern</i> is a <code>String</code>, then its contents are used as
5740  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5741  * space, <i>str</i> is split on whitespace, with leading whitespace and runs
5742  * of contiguous whitespace characters ignored.
5743  *
5744  * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5745  * pattern matches. Whenever the pattern matches a zero-length string,
5746  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
5747  * groups, the respective matches will be returned in the array as well.
5748  *
5749  * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
5750  * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5751  * split on whitespace as if ` ' were specified.
5752  *
5753  * If the <i>limit</i> parameter is omitted, trailing null fields are
5754  * suppressed. If <i>limit</i> is a positive number, at most that number of
5755  * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5756  * string is returned as the only entry in an array). If negative, there is no
5757  * limit to the number of fields returned, and trailing null fields are not
5758  * suppressed.
5759  *
5760  * " now's the time".split #=> ["now's", "the", "time"]
5761  * " now's the time".split(' ') #=> ["now's", "the", "time"]
5762  * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
5763  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5764  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
5765  * "hello".split(//, 3) #=> ["h", "e", "llo"]
5766  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
5767  *
5768  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
5769  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
5770  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
5771  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
5772  */
5773 
5774 static VALUE
5776 {
5777  rb_encoding *enc;
5778  VALUE spat;
5779  VALUE limit;
5780  enum {awk, string, regexp} split_type;
5781  long beg, end, i = 0;
5782  int lim = 0;
5783  VALUE result, tmp;
5784 
5785  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
5786  lim = NUM2INT(limit);
5787  if (lim <= 0) limit = Qnil;
5788  else if (lim == 1) {
5789  if (RSTRING_LEN(str) == 0)
5790  return rb_ary_new2(0);
5791  return rb_ary_new3(1, str);
5792  }
5793  i = 1;
5794  }
5795 
5796  enc = STR_ENC_GET(str);
5797  if (NIL_P(spat)) {
5798  if (!NIL_P(rb_fs)) {
5799  spat = rb_fs;
5800  goto fs_set;
5801  }
5802  split_type = awk;
5803  }
5804  else {
5805  fs_set:
5806  if (TYPE(spat) == T_STRING) {
5807  rb_encoding *enc2 = STR_ENC_GET(spat);
5808 
5809  split_type = string;
5810  if (RSTRING_LEN(spat) == 0) {
5811  /* Special case - split into chars */
5812  spat = rb_reg_regcomp(spat);
5813  split_type = regexp;
5814  }
5815  else if (rb_enc_asciicompat(enc2) == 1) {
5816  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
5817  split_type = awk;
5818  }
5819  }
5820  else {
5821  int l;
5822  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
5823  RSTRING_LEN(spat) == l) {
5824  split_type = awk;
5825  }
5826  }
5827  }
5828  else {
5829  spat = get_pat(spat, 1);
5830  split_type = regexp;
5831  }
5832  }
5833 
5834  result = rb_ary_new();
5835  beg = 0;
5836  if (split_type == awk) {
5837  char *ptr = RSTRING_PTR(str);
5838  char *eptr = RSTRING_END(str);
5839  char *bptr = ptr;
5840  int skip = 1;
5841  unsigned int c;
5842 
5843  end = beg;
5844  if (is_ascii_string(str)) {
5845  while (ptr < eptr) {
5846  c = (unsigned char)*ptr++;
5847  if (skip) {
5848  if (ascii_isspace(c)) {
5849  beg = ptr - bptr;
5850  }
5851  else {
5852  end = ptr - bptr;
5853  skip = 0;
5854  if (!NIL_P(limit) && lim <= i) break;
5855  }
5856  }
5857  else if (ascii_isspace(c)) {
5858  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5859  skip = 1;
5860  beg = ptr - bptr;
5861  if (!NIL_P(limit)) ++i;
5862  }
5863  else {
5864  end = ptr - bptr;
5865  }
5866  }
5867  }
5868  else {
5869  while (ptr < eptr) {
5870  int n;
5871 
5872  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
5873  ptr += n;
5874  if (skip) {
5875  if (rb_isspace(c)) {
5876  beg = ptr - bptr;
5877  }
5878  else {
5879  end = ptr - bptr;
5880  skip = 0;
5881  if (!NIL_P(limit) && lim <= i) break;
5882  }
5883  }
5884  else if (rb_isspace(c)) {
5885  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5886  skip = 1;
5887  beg = ptr - bptr;
5888  if (!NIL_P(limit)) ++i;
5889  }
5890  else {
5891  end = ptr - bptr;
5892  }
5893  }
5894  }
5895  }
5896  else if (split_type == string) {
5897  char *ptr = RSTRING_PTR(str);
5898  char *temp = ptr;
5899  char *eptr = RSTRING_END(str);
5900  char *sptr = RSTRING_PTR(spat);
5901  long slen = RSTRING_LEN(spat);
5902 
5903  if (is_broken_string(str)) {
5904  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
5905  }
5906  if (is_broken_string(spat)) {
5907  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
5908  }
5909  enc = rb_enc_check(str, spat);
5910  while (ptr < eptr &&
5911  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
5912  /* Check we are at the start of a char */
5913  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
5914  if (t != ptr + end) {
5915  ptr = t;
5916  continue;
5917  }
5918  rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
5919  ptr += end + slen;
5920  if (!NIL_P(limit) && lim <= ++i) break;
5921  }
5922  beg = ptr - temp;
5923  }
5924  else {
5925  char *ptr = RSTRING_PTR(str);
5926  long len = RSTRING_LEN(str);
5927  long start = beg;
5928  long idx;
5929  int last_null = 0;
5930  struct re_registers *regs;
5931 
5932  while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
5933  regs = RMATCH_REGS(rb_backref_get());
5934  if (start == end && BEG(0) == END(0)) {
5935  if (!ptr) {
5936  rb_ary_push(result, str_new_empty(str));
5937  break;
5938  }
5939  else if (last_null == 1) {
5940  rb_ary_push(result, rb_str_subseq(str, beg,
5941  rb_enc_fast_mbclen(ptr+beg,
5942  ptr+len,
5943  enc)));
5944  beg = start;
5945  }
5946  else {
5947  if (ptr+start == ptr+len)
5948  start++;
5949  else
5950  start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
5951  last_null = 1;
5952  continue;
5953  }
5954  }
5955  else {
5956  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5957  beg = start = END(0);
5958  }
5959  last_null = 0;
5960 
5961  for (idx=1; idx < regs->num_regs; idx++) {
5962  if (BEG(idx) == -1) continue;
5963  if (BEG(idx) == END(idx))
5964  tmp = str_new_empty(str);
5965  else
5966  tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5967  rb_ary_push(result, tmp);
5968  }
5969  if (!NIL_P(limit) && lim <= ++i) break;
5970  }
5971  }
5972  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
5973  if (RSTRING_LEN(str) == beg)
5974  tmp = str_new_empty(str);
5975  else
5976  tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5977  rb_ary_push(result, tmp);
5978  }
5979  if (NIL_P(limit) && lim == 0) {
5980  long len;
5981  while ((len = RARRAY_LEN(result)) > 0 &&
5982  (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
5983  rb_ary_pop(result);
5984  }
5985 
5986  return result;
5987 }
5988 
5989 VALUE
5990 rb_str_split(VALUE str, const char *sep0)
5991 {
5992  VALUE sep;
5993 
5994  StringValue(str);
5995  sep = rb_str_new2(sep0);
5996  return rb_str_split_m(1, &sep, str);
5997 }
5998 
5999 
6000 /*
6001  * call-seq:
6002  * str.each_line(separator=$/) {|substr| block } -> str
6003  * str.each_line(separator=$/) -> an_enumerator
6004  *
6005  * str.lines(separator=$/) {|substr| block } -> str
6006  * str.lines(separator=$/) -> an_enumerator
6007  *
6008  * Splits <i>str</i> using the supplied parameter as the record separator
6009  * (<code>$/</code> by default), passing each substring in turn to the supplied
6010  * block. If a zero-length record separator is supplied, the string is split
6011  * into paragraphs delimited by multiple successive newlines.
6012  *
6013  * If no block is given, an enumerator is returned instead.
6014  *
6015  * print "Example one\n"
6016  * "hello\nworld".each_line {|s| p s}
6017  * print "Example two\n"
6018  * "hello\nworld".each_line('l') {|s| p s}
6019  * print "Example three\n"
6020  * "hello\n\n\nworld".each_line('') {|s| p s}
6021  *
6022  * <em>produces:</em>
6023  *
6024  * Example one
6025  * "hello\n"
6026  * "world"
6027  * Example two
6028  * "hel"
6029  * "l"
6030  * "o\nworl"
6031  * "d"
6032  * Example three
6033  * "hello\n\n\n"
6034  * "world"
6035  */
6036 
6037 static VALUE
6039 {
6040  rb_encoding *enc;
6041  VALUE rs;
6042  unsigned int newline;
6043  const char *p, *pend, *s, *ptr;
6044  long len, rslen;
6045  VALUE line;
6046  int n;
6047  VALUE orig = str;
6048 
6049  if (argc == 0) {
6050  rs = rb_rs;
6051  }
6052  else {
6053  rb_scan_args(argc, argv, "01", &rs);
6054  }
6055  RETURN_ENUMERATOR(str, argc, argv);
6056  if (NIL_P(rs)) {
6057  rb_yield(str);
6058  return orig;
6059  }
6060  str = rb_str_new4(str);
6061  ptr = p = s = RSTRING_PTR(str);
6062  pend = p + RSTRING_LEN(str);
6063  len = RSTRING_LEN(str);
6064  StringValue(rs);
6065  if (rs == rb_default_rs) {
6066  enc = rb_enc_get(str);
6067  while (p < pend) {
6068  char *p0;
6069 
6070  p = memchr(p, '\n', pend - p);
6071  if (!p) break;
6072  p0 = rb_enc_left_char_head(s, p, pend, enc);
6073  if (!rb_enc_is_newline(p0, pend, enc)) {
6074  p++;
6075  continue;
6076  }
6077  p = p0 + rb_enc_mbclen(p0, pend, enc);
6078  line = rb_str_new5(str, s, p - s);
6079  OBJ_INFECT(line, str);
6080  rb_enc_cr_str_copy_for_substr(line, str);
6081  rb_yield(line);
6082  str_mod_check(str, ptr, len);
6083  s = p;
6084  }
6085  goto finish;
6086  }
6087 
6088  enc = rb_enc_check(str, rs);
6089  rslen = RSTRING_LEN(rs);
6090  if (rslen == 0) {
6091  newline = '\n';
6092  }
6093  else {
6094  newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
6095  }
6096 
6097  while (p < pend) {
6098  unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6099 
6100  again:
6101  if (rslen == 0 && c == newline) {
6102  p += n;
6103  if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6104  goto again;
6105  }
6106  while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6107  p += n;
6108  }
6109  p -= n;
6110  }
6111  if (c == newline &&
6112  (rslen <= 1 ||
6113  (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
6114  line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
6115  OBJ_INFECT(line, str);
6116  rb_enc_cr_str_copy_for_substr(line, str);
6117  rb_yield(line);
6118  str_mod_check(str, ptr, len);
6119  s = p + (rslen ? rslen : n);
6120  }
6121  p += n;
6122  }
6123 
6124  finish:
6125  if (s != pend) {
6126  line = rb_str_new5(str, s, pend - s);
6127  OBJ_INFECT(line, str);
6128  rb_enc_cr_str_copy_for_substr(line, str);
6129  rb_yield(line);
6130  }
6131 
6132  return orig;
6133 }
6134 
6135 
6136 /*
6137  * call-seq:
6138  * str.bytes {|fixnum| block } -> str
6139  * str.bytes -> an_enumerator
6140  *
6141  * str.each_byte {|fixnum| block } -> str
6142  * str.each_byte -> an_enumerator
6143  *
6144  * Passes each byte in <i>str</i> to the given block, or returns
6145  * an enumerator if no block is given.
6146  *
6147  * "hello".each_byte {|c| print c, ' ' }
6148  *
6149  * <em>produces:</em>
6150  *
6151  * 104 101 108 108 111
6152  */
6153 
6154 static VALUE
6156 {
6157  long i;
6158 
6159  RETURN_ENUMERATOR(str, 0, 0);
6160  for (i=0; i<RSTRING_LEN(str); i++) {
6161  rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6162  }
6163  return str;
6164 }
6165 
6166 
6167 /*
6168  * call-seq:
6169  * str.chars {|cstr| block } -> str
6170  * str.chars -> an_enumerator
6171  *
6172  * str.each_char {|cstr| block } -> str
6173  * str.each_char -> an_enumerator
6174  *
6175  * Passes each character in <i>str</i> to the given block, or returns
6176  * an enumerator if no block is given.
6177  *
6178  * "hello".each_char {|c| print c, ' ' }
6179  *
6180  * <em>produces:</em>
6181  *
6182  * h e l l o
6183  */
6184 
6185 static VALUE
6187 {
6188  VALUE orig = str;
6189  long i, len, n;
6190  const char *ptr;
6191  rb_encoding *enc;
6192 
6193  RETURN_ENUMERATOR(str, 0, 0);
6194  str = rb_str_new4(str);
6195  ptr = RSTRING_PTR(str);
6196  len = RSTRING_LEN(str);
6197  enc = rb_enc_get(str);
6198  switch (ENC_CODERANGE(str)) {
6199  case ENC_CODERANGE_VALID:
6200  case ENC_CODERANGE_7BIT:
6201  for (i = 0; i < len; i += n) {
6202  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6203  rb_yield(rb_str_subseq(str, i, n));
6204  }
6205  break;
6206  default:
6207  for (i = 0; i < len; i += n) {
6208  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6209  rb_yield(rb_str_subseq(str, i, n));
6210  }
6211  }
6212  return orig;
6213 }
6214 
6215 /*
6216  * call-seq:
6217  * str.codepoints {|integer| block } -> str
6218  * str.codepoints -> an_enumerator
6219  *
6220  * str.each_codepoint {|integer| block } -> str
6221  * str.each_codepoint -> an_enumerator
6222  *
6223  * Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
6224  * also known as a <i>codepoint</i> when applied to Unicode strings to the
6225  * given block.
6226  *
6227  * If no block is given, an enumerator is returned instead.
6228  *
6229  * "hello\u0639".each_codepoint {|c| print c, ' ' }
6230  *
6231  * <em>produces:</em>
6232  *
6233  * 104 101 108 108 111 1593
6234  */
6235 
6236 static VALUE
6238 {
6239  VALUE orig = str;
6240  int n;
6241  unsigned int c;
6242  const char *ptr, *end;
6243  rb_encoding *enc;
6244 
6245  if (single_byte_optimizable(str)) return rb_str_each_byte(str);
6246  RETURN_ENUMERATOR(str, 0, 0);
6247  str = rb_str_new4(str);
6248  ptr = RSTRING_PTR(str);
6249  end = RSTRING_END(str);
6250  enc = STR_ENC_GET(str);
6251  while (ptr < end) {
6252  c = rb_enc_codepoint_len(ptr, end, &n, enc);
6253  rb_yield(UINT2NUM(c));
6254  ptr += n;
6255  }
6256  return orig;
6257 }
6258 
6259 static long
6261 {
6262  rb_encoding *enc = STR_ENC_GET(str);
6263  const char *p, *p2, *beg, *end;
6264 
6265  beg = RSTRING_PTR(str);
6266  end = beg + RSTRING_LEN(str);
6267  if (beg > end) return 0;
6268  p = rb_enc_prev_char(beg, end, end, enc);
6269  if (!p) return 0;
6270  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6271  p2 = rb_enc_prev_char(beg, p, end, enc);
6272  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6273  }
6274  return p - beg;
6275 }
6276 
6277 /*
6278  * call-seq:
6279  * str.chop! -> str or nil
6280  *
6281  * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6282  * or <code>nil</code> if <i>str</i> is the empty string. See also
6283  * <code>String#chomp!</code>.
6284  */
6285 
6286 static VALUE
6288 {
6289  str_modify_keep_cr(str);
6290  if (RSTRING_LEN(str) > 0) {
6291  long len;
6292  len = chopped_length(str);
6293  STR_SET_LEN(str, len);
6294  RSTRING_PTR(str)[len] = '\0';
6295  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6296  ENC_CODERANGE_CLEAR(str);
6297  }
6298  return str;
6299  }
6300  return Qnil;
6301 }
6302 
6303 
6304 /*
6305  * call-seq:
6306  * str.chop -> new_str
6307  *
6308  * Returns a new <code>String</code> with the last character removed. If the
6309  * string ends with <code>\r\n</code>, both characters are removed. Applying
6310  * <code>chop</code> to an empty string returns an empty
6311  * string. <code>String#chomp</code> is often a safer alternative, as it leaves
6312  * the string unchanged if it doesn't end in a record separator.
6313  *
6314  * "string\r\n".chop #=> "string"
6315  * "string\n\r".chop #=> "string\n"
6316  * "string\n".chop #=> "string"
6317  * "string".chop #=> "strin"
6318  * "x".chop.chop #=> ""
6319  */
6320 
6321 static VALUE
6323 {
6324  VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
6325  rb_enc_cr_str_copy_for_substr(str2, str);
6326  OBJ_INFECT(str2, str);
6327  return str2;
6328 }
6329 
6330 
6331 /*
6332  * call-seq:
6333  * str.chomp!(separator=$/) -> str or nil
6334  *
6335  * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6336  * returning <i>str</i>, or <code>nil</code> if no modifications were made.
6337  */
6338 
6339 static VALUE
6341 {
6342  rb_encoding *enc;
6343  VALUE rs;
6344  int newline;
6345  char *p, *pp, *e;
6346  long len, rslen;
6347 
6348  str_modify_keep_cr(str);
6349  len = RSTRING_LEN(str);
6350  if (len == 0) return Qnil;
6351  p = RSTRING_PTR(str);
6352  e = p + len;
6353  if (argc == 0) {
6354  rs = rb_rs;
6355  if (rs == rb_default_rs) {
6356  smart_chomp:
6357  enc = rb_enc_get(str);
6358  if (rb_enc_mbminlen(enc) > 1) {
6359  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6360  if (rb_enc_is_newline(pp, e, enc)) {
6361  e = pp;
6362  }
6363  pp = e - rb_enc_mbminlen(enc);
6364  if (pp >= p) {
6365  pp = rb_enc_left_char_head(p, pp, e, enc);
6366  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6367  e = pp;
6368  }
6369  }
6370  if (e == RSTRING_END(str)) {
6371  return Qnil;
6372  }
6373  len = e - RSTRING_PTR(str);
6374  STR_SET_LEN(str, len);
6375  }
6376  else {
6377  if (RSTRING_PTR(str)[len-1] == '\n') {
6378  STR_DEC_LEN(str);
6379  if (RSTRING_LEN(str) > 0 &&
6380  RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6381  STR_DEC_LEN(str);
6382  }
6383  }
6384  else if (RSTRING_PTR(str)[len-1] == '\r') {
6385  STR_DEC_LEN(str);
6386  }
6387  else {
6388  return Qnil;
6389  }
6390  }
6391  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6392  return str;
6393  }
6394  }
6395  else {
6396  rb_scan_args(argc, argv, "01", &rs);
6397  }
6398  if (NIL_P(rs)) return Qnil;
6399  StringValue(rs);
6400  rslen = RSTRING_LEN(rs);
6401  if (rslen == 0) {
6402  while (len>0 && p[len-1] == '\n') {
6403  len--;
6404  if (len>0 && p[len-1] == '\r')
6405  len--;
6406  }
6407  if (len < RSTRING_LEN(str)) {
6408  STR_SET_LEN(str, len);
6409  RSTRING_PTR(str)[len] = '\0';
6410  return str;
6411  }
6412  return Qnil;
6413  }
6414  if (rslen > len) return Qnil;
6415  newline = RSTRING_PTR(rs)[rslen-1];
6416  if (rslen == 1 && newline == '\n')
6417  goto smart_chomp;
6418 
6419  enc = rb_enc_check(str, rs);
6420  if (is_broken_string(rs)) {
6421  return Qnil;
6422  }
6423  pp = e - rslen;
6424  if (p[len-1] == newline &&
6425  (rslen <= 1 ||
6426  memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
6427  if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6428  return Qnil;
6429  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6430  ENC_CODERANGE_CLEAR(str);
6431  }
6432  STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6433  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6434  return str;
6435  }
6436  return Qnil;
6437 }
6438 
6439 
6440 /*
6441  * call-seq:
6442  * str.chomp(separator=$/) -> new_str
6443  *
6444  * Returns a new <code>String</code> with the given record separator removed
6445  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
6446  * changed from the default Ruby record separator, then <code>chomp</code> also
6447  * removes carriage return characters (that is it will remove <code>\n</code>,
6448  * <code>\r</code>, and <code>\r\n</code>).
6449  *
6450  * "hello".chomp #=> "hello"
6451  * "hello\n".chomp #=> "hello"
6452  * "hello\r\n".chomp #=> "hello"
6453  * "hello\n\r".chomp #=> "hello\n"
6454  * "hello\r".chomp #=> "hello"
6455  * "hello \n there".chomp #=> "hello \n there"
6456  * "hello".chomp("llo") #=> "he"
6457  */
6458 
6459 static VALUE
6461 {
6462  str = rb_str_dup(str);
6463  rb_str_chomp_bang(argc, argv, str);
6464  return str;
6465 }
6466 
6467 /*
6468  * call-seq:
6469  * str.lstrip! -> self or nil
6470  *
6471  * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6472  * change was made. See also <code>String#rstrip!</code> and
6473  * <code>String#strip!</code>.
6474  *
6475  * " hello ".lstrip! #=> "hello "
6476  * "hello".lstrip! #=> nil
6477  */
6478 
6479 static VALUE
6481 {
6482  rb_encoding *enc;
6483  char *s, *t, *e;
6484 
6485  str_modify_keep_cr(str);
6486  enc = STR_ENC_GET(str);
6487  s = RSTRING_PTR(str);
6488  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6489  e = t = RSTRING_END(str);
6490  /* remove spaces at head */
6491  while (s < e) {
6492  int n;
6493  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6494 
6495  if (!rb_isspace(cc)) break;
6496  s += n;
6497  }
6498 
6499  if (s > RSTRING_PTR(str)) {
6500  STR_SET_LEN(str, t-s);
6501  memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6502  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6503  return str;
6504  }
6505  return Qnil;
6506 }
6507 
6508 
6509 /*
6510  * call-seq:
6511  * str.lstrip -> new_str
6512  *
6513  * Returns a copy of <i>str</i> with leading whitespace removed. See also
6514  * <code>String#rstrip</code> and <code>String#strip</code>.
6515  *
6516  * " hello ".lstrip #=> "hello "
6517  * "hello".lstrip #=> "hello"
6518  */
6519 
6520 static VALUE
6522 {
6523  str = rb_str_dup(str);
6524  rb_str_lstrip_bang(str);
6525  return str;
6526 }
6527 
6528 
6529 /*
6530  * call-seq:
6531  * str.rstrip! -> self or nil
6532  *
6533  * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
6534  * no change was made. See also <code>String#lstrip!</code> and
6535  * <code>String#strip!</code>.
6536  *
6537  * " hello ".rstrip! #=> " hello"
6538  * "hello".rstrip! #=> nil
6539  */
6540 
6541 static VALUE
6543 {
6544  rb_encoding *enc;
6545  char *s, *t, *e;
6546 
6547  str_modify_keep_cr(str);
6548  enc = STR_ENC_GET(str);
6550  s = RSTRING_PTR(str);
6551  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6552  t = e = RSTRING_END(str);
6553 
6554  /* remove trailing spaces or '\0's */
6555  if (single_byte_optimizable(str)) {
6556  unsigned char c;
6557  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
6558  }
6559  else {
6560  char *tp;
6561 
6562  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
6563  unsigned int c = rb_enc_codepoint(tp, e, enc);
6564  if (c && !rb_isspace(c)) break;
6565  t = tp;
6566  }
6567  }
6568  if (t < e) {
6569  long len = t-RSTRING_PTR(str);
6570 
6571  STR_SET_LEN(str, len);
6572  RSTRING_PTR(str)[len] = '\0';
6573  return str;
6574  }
6575  return Qnil;
6576 }
6577 
6578 
6579 /*
6580  * call-seq:
6581  * str.rstrip -> new_str
6582  *
6583  * Returns a copy of <i>str</i> with trailing whitespace removed. See also
6584  * <code>String#lstrip</code> and <code>String#strip</code>.
6585  *
6586  * " hello ".rstrip #=> " hello"
6587  * "hello".rstrip #=> "hello"
6588  */
6589 
6590 static VALUE
6592 {
6593  str = rb_str_dup(str);
6594  rb_str_rstrip_bang(str);
6595  return str;
6596 }
6597 
6598 
6599 /*
6600  * call-seq:
6601  * str.strip! -> str or nil
6602  *
6603  * Removes leading and trailing whitespace from <i>str</i>. Returns
6604  * <code>nil</code> if <i>str</i> was not altered.
6605  */
6606 
6607 static VALUE
6609 {
6610  VALUE l = rb_str_lstrip_bang(str);
6611  VALUE r = rb_str_rstrip_bang(str);
6612 
6613  if (NIL_P(l) && NIL_P(r)) return Qnil;
6614  return str;
6615 }
6616 
6617 
6618 /*
6619  * call-seq:
6620  * str.strip -> new_str
6621  *
6622  * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
6623  *
6624  * " hello ".strip #=> "hello"
6625  * "\tgoodbye\r\n".strip #=> "goodbye"
6626  */
6627 
6628 static VALUE
6630 {
6631  str = rb_str_dup(str);
6632  rb_str_strip_bang(str);
6633  return str;
6634 }
6635 
6636 static VALUE
6637 scan_once(VALUE str, VALUE pat, long *start)
6638 {
6639  VALUE result, match;
6640  struct re_registers *regs;
6641  int i;
6642 
6643  if (rb_reg_search(pat, str, *start, 0) >= 0) {
6644  match = rb_backref_get();
6645  regs = RMATCH_REGS(match);
6646  if (BEG(0) == END(0)) {
6647  rb_encoding *enc = STR_ENC_GET(str);
6648  /*
6649  * Always consume at least one character of the input string
6650  */
6651  if (RSTRING_LEN(str) > END(0))
6652  *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
6653  RSTRING_END(str), enc);
6654  else
6655  *start = END(0)+1;
6656  }
6657  else {
6658  *start = END(0);
6659  }
6660  if (regs->num_regs == 1) {
6661  return rb_reg_nth_match(0, match);
6662  }
6663  result = rb_ary_new2(regs->num_regs);
6664  for (i=1; i < regs->num_regs; i++) {
6665  rb_ary_push(result, rb_reg_nth_match(i, match));
6666  }
6667 
6668  return result;
6669  }
6670  return Qnil;
6671 }
6672 
6673 
6674 /*
6675  * call-seq:
6676  * str.scan(pattern) -> array
6677  * str.scan(pattern) {|match, ...| block } -> str
6678  *
6679  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
6680  * <code>Regexp</code> or a <code>String</code>). For each match, a result is
6681  * generated and either added to the result array or passed to the block. If
6682  * the pattern contains no groups, each individual result consists of the
6683  * matched string, <code>$&</code>. If the pattern contains groups, each
6684  * individual result is itself an array containing one entry per group.
6685  *
6686  * a = "cruel world"
6687  * a.scan(/\w+/) #=> ["cruel", "world"]
6688  * a.scan(/.../) #=> ["cru", "el ", "wor"]
6689  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
6690  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
6691  *
6692  * And the block form:
6693  *
6694  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
6695  * print "\n"
6696  * a.scan(/(.)(.)/) {|x,y| print y, x }
6697  * print "\n"
6698  *
6699  * <em>produces:</em>
6700  *
6701  * <<cruel>> <<world>>
6702  * rceu lowlr
6703  */
6704 
6705 static VALUE
6707 {
6708  VALUE result;
6709  long start = 0;
6710  long last = -1, prev = 0;
6711  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
6712 
6713  pat = get_pat(pat, 1);
6714  if (!rb_block_given_p()) {
6715  VALUE ary = rb_ary_new();
6716 
6717  while (!NIL_P(result = scan_once(str, pat, &start))) {
6718  last = prev;
6719  prev = start;
6720  rb_ary_push(ary, result);
6721  }
6722  if (last >= 0) rb_reg_search(pat, str, last, 0);
6723  return ary;
6724  }
6725 
6726  while (!NIL_P(result = scan_once(str, pat, &start))) {
6727  last = prev;
6728  prev = start;
6729  rb_yield(result);
6730  str_mod_check(str, p, len);
6731  }
6732  if (last >= 0) rb_reg_search(pat, str, last, 0);
6733  return str;
6734 }
6735 
6736 
6737 /*
6738  * call-seq:
6739  * str.hex -> integer
6740  *
6741  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
6742  * (with an optional sign and an optional <code>0x</code>) and returns the
6743  * corresponding number. Zero is returned on error.
6744  *
6745  * "0x0a".hex #=> 10
6746  * "-1234".hex #=> -4660
6747  * "0".hex #=> 0
6748  * "wombat".hex #=> 0
6749  */
6750 
6751 static VALUE
6753 {
6754  rb_encoding *enc = rb_enc_get(str);
6755 
6756  if (!rb_enc_asciicompat(enc)) {
6757  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6758  }
6759  return rb_str_to_inum(str, 16, FALSE);
6760 }
6761 
6762 
6763 /*
6764  * call-seq:
6765  * str.oct -> integer
6766  *
6767  * Treats leading characters of <i>str</i> as a string of octal digits (with an
6768  * optional sign) and returns the corresponding number. Returns 0 if the
6769  * conversion fails.
6770  *
6771  * "123".oct #=> 83
6772  * "-377".oct #=> -255
6773  * "bad".oct #=> 0
6774  * "0377bad".oct #=> 255
6775  */
6776 
6777 static VALUE
6779 {
6780  rb_encoding *enc = rb_enc_get(str);
6781 
6782  if (!rb_enc_asciicompat(enc)) {
6783  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6784  }
6785  return rb_str_to_inum(str, -8, FALSE);
6786 }
6787 
6788 
6789 /*
6790  * call-seq:
6791  * str.crypt(other_str) -> new_str
6792  *
6793  * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
6794  * library function <code>crypt</code>. The argument is the salt string, which
6795  * should be two characters long, each character drawn from
6796  * <code>[a-zA-Z0-9./]</code>.
6797  */
6798 
6799 static VALUE
6801 {
6802  extern char *crypt(const char *, const char *);
6803  VALUE result;
6804  const char *s, *saltp;
6805  char *res;
6806 #ifdef BROKEN_CRYPT
6807  char salt_8bit_clean[3];
6808 #endif
6809 
6810  StringValue(salt);
6811  if (RSTRING_LEN(salt) < 2)
6812  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
6813 
6814  s = RSTRING_PTR(str);
6815  if (!s) s = "";
6816  saltp = RSTRING_PTR(salt);
6817 #ifdef BROKEN_CRYPT
6818  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
6819  salt_8bit_clean[0] = saltp[0] & 0x7f;
6820  salt_8bit_clean[1] = saltp[1] & 0x7f;
6821  salt_8bit_clean[2] = '\0';
6822  saltp = salt_8bit_clean;
6823  }
6824 #endif
6825  res = crypt(s, saltp);
6826  if (!res) {
6827  rb_sys_fail("crypt");
6828  }
6829  result = rb_str_new2(res);
6830  OBJ_INFECT(result, str);
6831  OBJ_INFECT(result, salt);
6832  return result;
6833 }
6834 
6835 
6836 /*
6837  * call-seq:
6838  * str.intern -> symbol
6839  * str.to_sym -> symbol
6840  *
6841  * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
6842  * symbol if it did not previously exist. See <code>Symbol#id2name</code>.
6843  *
6844  * "Koala".intern #=> :Koala
6845  * s = 'cat'.to_sym #=> :cat
6846  * s == :cat #=> true
6847  * s = '@cat'.to_sym #=> :@cat
6848  * s == :@cat #=> true
6849  *
6850  * This can also be used to create symbols that cannot be represented using the
6851  * <code>:xxx</code> notation.
6852  *
6853  * 'cat and dog'.to_sym #=> :"cat and dog"
6854  */
6855 
6856 VALUE
6858 {
6859  VALUE str = RB_GC_GUARD(s);
6860  ID id;
6861 
6862  id = rb_intern_str(str);
6863  return ID2SYM(id);
6864 }
6865 
6866 
6867 /*
6868  * call-seq:
6869  * str.ord -> integer
6870  *
6871  * Returns the <code>Integer</code> ordinal of a one-character string.
6872  *
6873  * "a".ord #=> 97
6874  */
6875 
6876 VALUE
6878 {
6879  unsigned int c;
6880 
6882  return UINT2NUM(c);
6883 }
6884 /*
6885  * call-seq:
6886  * str.sum(n=16) -> integer
6887  *
6888  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
6889  * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
6890  * to 16. The result is simply the sum of the binary value of each character in
6891  * <i>str</i> modulo <code>2**n</code> (the sum is masked with <code>2**n - 1</code>). This is not a particularly good
6892  * checksum.
6893  */
6894 
6895 static VALUE
6897 {
6898  VALUE vbits;
6899  int bits;
6900  char *ptr, *p, *pend;
6901  long len;
6902  VALUE sum = INT2FIX(0);
6903  unsigned long sum0 = 0;
6904 
6905  if (argc == 0) {
6906  bits = 16;
6907  }
6908  else {
6909  rb_scan_args(argc, argv, "01", &vbits);
6910  bits = NUM2INT(vbits);
6911  }
6912  ptr = p = RSTRING_PTR(str);
6913  len = RSTRING_LEN(str);
6914  pend = p + len;
6915 
6916  while (p < pend) {
6917  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
6918  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6919  str_mod_check(str, ptr, len);
6920  sum0 = 0;
6921  }
6922  sum0 += (unsigned char)*p;
6923  p++;
6924  }
6925 
6926  if (bits == 0) {
6927  if (sum0) {
6928  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6929  }
6930  }
6931  else {
6932  if (sum == INT2FIX(0)) {
6933  if (bits < (int)sizeof(long)*CHAR_BIT) {
6934  sum0 &= (((unsigned long)1)<<bits)-1;
6935  }
6936  sum = LONG2FIX(sum0);
6937  }
6938  else {
6939  VALUE mod;
6940 
6941  if (sum0) {
6942  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6943  }
6944 
6945  mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
6946  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
6947  sum = rb_funcall(sum, '&', 1, mod);
6948  }
6949  }
6950  return sum;
6951 }
6952 
/*
 * Shared implementation used by the three callers in this file that pass
 * jflag 'l', 'r' and 'c'.
 *
 * argv[0] is the target width in characters; the optional argv[1] is the
 * padding string (a single space when omitted).
 *
 * Returns a duplicate of str when width is negative or str is already at
 * least width characters long; otherwise a new padded string.
 *
 * Raises ArgumentError ("zero width padding") when the padding string is
 * empty, and ArgumentError ("argument too big") when the size computation
 * for the result would overflow.
 */
6953 static VALUE
6954 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
6955 {
6956  rb_encoding *enc;
6957  VALUE w;
/* flen: pad length in bytes; fclen: pad length in characters */
6958  long width, len, flen = 1, fclen = 1;
6959  VALUE res;
6960  char *p;
6961  const char *f = " ";
/* llen/rlen: pad characters on the left/right; llen2/rlen2: byte length of
 * the partial pad repetition on each side */
6962  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
6963  volatile VALUE pad;
6964  int singlebyte = 1, cr;
6965 
6966  rb_scan_args(argc, argv, "11", &w, &pad);
6967  enc = STR_ENC_GET(str);
6968  width = NUM2LONG(w);
6969  if (argc == 2) {
6970  StringValue(pad);
6971  enc = rb_enc_check(str, pad);
6972  f = RSTRING_PTR(pad);
6973  flen = RSTRING_LEN(pad);
6974  fclen = str_strlen(pad, enc);
6975  singlebyte = single_byte_optimizable(pad);
6976  if (flen == 0 || fclen == 0) {
6977  rb_raise(rb_eArgError, "zero width padding");
6978  }
6979  }
6980  len = str_strlen(str, enc);
6981  if (width < 0 || len >= width) return rb_str_dup(str);
6982  n = width - len;
/* 'l': all padding on the right; 'r': all on the left; otherwise split, with
 * the left side getting n/2 */
6983  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
6984  rlen = n - llen;
6985  cr = ENC_CODERANGE(str);
6986  if (flen > 1) {
6987  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
6988  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
6989  }
6990  size = RSTRING_LEN(str);
/* Each clause both checks for overflow and updates len; the order matters. */
6991  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
6992  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
6993  (len += llen2 + rlen2) >= LONG_MAX - size) {
6994  rb_raise(rb_eArgError, "argument too big");
6995  }
6996  len += size;
6997  res = rb_str_new5(str, 0, len);
6998  p = RSTRING_PTR(res);
/* left padding */
6999  if (flen <= 1) {
7000  memset(p, *f, llen);
7001  p += llen;
7002  }
7003  else {
7004  while (llen >= fclen) {
7005  memcpy(p,f,flen);
7006  p += flen;
7007  llen -= fclen;
7008  }
7009  if (llen > 0) {
7010  memcpy(p, f, llen2);
7011  p += llen2;
7012  }
7013  }
/* original contents */
7014  memcpy(p, RSTRING_PTR(str), size);
7015  p += size;
/* right padding */
7016  if (flen <= 1) {
7017  memset(p, *f, rlen);
7018  p += rlen;
7019  }
7020  else {
7021  while (rlen >= fclen) {
7022  memcpy(p,f,flen);
7023  p += flen;
7024  rlen -= fclen;
7025  }
7026  if (rlen > 0) {
7027  memcpy(p, f, rlen2);
7028  p += rlen2;
7029  }
7030  }
7031  *p = '\0';
7032  STR_SET_LEN(res, p-RSTRING_PTR(res));
7033  OBJ_INFECT(res, str);
7034  if (!NIL_P(pad)) OBJ_INFECT(res, pad);
7035  rb_enc_associate(res, enc);
/* combine the code ranges of str and pad; a broken result is left unset */
7036  if (argc == 2)
7037  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
7038  if (cr != ENC_CODERANGE_BROKEN)
7039  ENC_CODERANGE_SET(res, cr);
7040  return res;
7041 }
7042 
7043 
7044 /*
7045  * call-seq:
7046  * str.ljust(integer, padstr=' ') -> new_str
7047  *
7048  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7049  * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
7050  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7051  *
7052  * "hello".ljust(4) #=> "hello"
7053  * "hello".ljust(20) #=> "hello "
7054  * "hello".ljust(20, '1234') #=> "hello123412341234123"
7055  */
7056 
7057 static VALUE
7059 {
7060  return rb_str_justify(argc, argv, str, 'l');
7061 }
7062 
7063 
7064 /*
7065  * call-seq:
7066  * str.rjust(integer, padstr=' ') -> new_str
7067  *
7068  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7069  * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
7070  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7071  *
7072  * "hello".rjust(4) #=> "hello"
7073  * "hello".rjust(20) #=> " hello"
7074  * "hello".rjust(20, '1234') #=> "123412341234123hello"
7075  */
7076 
7077 static VALUE
7079 {
7080  return rb_str_justify(argc, argv, str, 'r');
7081 }
7082 
7083 
7084 /*
7085  * call-seq:
7086  * str.center(integer, padstr=' ') -> new_str
7087  *
7088  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7089  * <code>String</code> of length <i>integer</i> with <i>str</i> centered and
7090  * padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7091  *
7092  * "hello".center(4) #=> "hello"
7093  * "hello".center(20) #=> " hello "
7094  * "hello".center(20, '123') #=> "1231231hello12312312"
7095  */
7096 
7097 static VALUE
7099 {
7100  return rb_str_justify(argc, argv, str, 'c');
7101 }
7102 
7103 /*
7104  * call-seq:
7105  * str.partition(sep) -> [head, sep, tail]
7106  * str.partition(regexp) -> [head, match, tail]
7107  *
7108  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
7109  * and returns the part before it, the match, and the part
7110  * after it.
7111  * If it is not found, returns <i>str</i> and two empty strings.
7112  *
7113  * "hello".partition("l") #=> ["he", "l", "lo"]
7114  * "hello".partition("x") #=> ["hello", "", ""]
7115  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
7116  */
7117 
7118 static VALUE
7120 {
7121  long pos;
7122  int regex = FALSE;
7123 
7124  if (TYPE(sep) == T_REGEXP) {
7125  pos = rb_reg_search(sep, str, 0, 0);
7126  regex = TRUE;
7127  }
7128  else {
7129  VALUE tmp;
7130 
7131  tmp = rb_check_string_type(sep);
7132  if (NIL_P(tmp)) {
7133  rb_raise(rb_eTypeError, "type mismatch: %s given",
7134  rb_obj_classname(sep));
7135  }
7136  sep = tmp;
7137  pos = rb_str_index(str, sep, 0);
7138  }
7139  if (pos < 0) {
7140  failed:
7141  return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
7142  }
7143  if (regex) {
7144  sep = rb_str_subpat(str, sep, INT2FIX(0));
7145  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
7146  }
7147  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
7148  sep,
7149  rb_str_subseq(str, pos+RSTRING_LEN(sep),
7150  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
7151 }
7152 
7153 /*
7154  * call-seq:
7155  * str.rpartition(sep) -> [head, sep, tail]
7156  * str.rpartition(regexp) -> [head, match, tail]
7157  *
7158  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
7159  * of the string, and returns the part before it, the match, and the part
7160  * after it.
7161  * If it is not found, returns two empty strings and <i>str</i>.
7162  *
7163  * "hello".rpartition("l") #=> ["hel", "l", "o"]
7164  * "hello".rpartition("x") #=> ["", "", "hello"]
7165  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
7166  */
7167 
7168 static VALUE
7170 {
7171  long pos = RSTRING_LEN(str);
7172  int regex = FALSE;
7173 
7174  if (TYPE(sep) == T_REGEXP) {
7175  pos = rb_reg_search(sep, str, pos, 1);
7176  regex = TRUE;
7177  }
7178  else {
7179  VALUE tmp;
7180 
7181  tmp = rb_check_string_type(sep);
7182  if (NIL_P(tmp)) {
7183  rb_raise(rb_eTypeError, "type mismatch: %s given",
7184  rb_obj_classname(sep));
7185  }
7186  sep = tmp;
7187  pos = rb_str_sublen(str, pos);
7188  pos = rb_str_rindex(str, sep, pos);
7189  }
7190  if (pos < 0) {
7191  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
7192  }
7193  if (regex) {
7194  sep = rb_reg_nth_match(0, rb_backref_get());
7195  }
7196  return rb_ary_new3(3, rb_str_substr(str, 0, pos),
7197  sep,
7198  rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
7199 }
7200 
7201 /*
7202  * call-seq:
7203  * str.start_with?([prefix]+) -> true or false
7204  *
7205  * Returns true if <i>str</i> starts with one of the prefixes given.
7206  *
7207  * p "hello".start_with?("hell") #=> true
7208  *
7209  * # returns true if one of the prefixes matches.
7210  * p "hello".start_with?("heaven", "hell") #=> true
7211  * p "hello".start_with?("heaven", "paradise") #=> false
7212  *
7213  *
7214  *
7215  */
7216 
7217 static VALUE
7219 {
7220  int i;
7221 
7222  for (i=0; i<argc; i++) {
7223  VALUE tmp = rb_check_string_type(argv[i]);
7224  if (NIL_P(tmp)) continue;
7225  rb_enc_check(str, tmp);
7226  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7227  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7228  return Qtrue;
7229  }
7230  return Qfalse;
7231 }
7232 
7233 /*
7234  * call-seq:
7235  * str.end_with?([suffix]+) -> true or false
7236  *
7237  * Returns true if <i>str</i> ends with one of the suffixes given.
7238  */
7239 
7240 static VALUE
7242 {
7243  int i;
7244  char *p, *s, *e;
7245  rb_encoding *enc;
7246 
7247  for (i=0; i<argc; i++) {
7248  VALUE tmp = rb_check_string_type(argv[i]);
7249  if (NIL_P(tmp)) continue;
7250  enc = rb_enc_check(str, tmp);
7251  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7252  p = RSTRING_PTR(str);
7253  e = p + RSTRING_LEN(str);
7254  s = e - RSTRING_LEN(tmp);
7255  if (rb_enc_left_char_head(p, s, e, enc) != s)
7256  continue;
7257  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7258  return Qtrue;
7259  }
7260  return Qfalse;
7261 }
7262 
7263 void
7264 rb_str_setter(VALUE val, ID id, VALUE *var)
7265 {
7266  if (!NIL_P(val) && TYPE(val) != T_STRING) {
7267  rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
7268  }
7269  *var = val;
7270 }
7271 
7272 
7273 /*
7274  * call-seq:
7275  * str.force_encoding(encoding) -> str
7276  *
7277  * Changes the encoding to +encoding+ and returns self.
7278  */
7279 
7280 static VALUE
7282 {
7283  str_modifiable(str);
7284  rb_enc_associate(str, rb_to_encoding(enc));
7285  ENC_CODERANGE_CLEAR(str);
7286  return str;
7287 }
7288 
7289 /*
7290  * call-seq:
7291  * str.valid_encoding? -> true or false
7292  *
7293  * Returns true for a string which is encoded correctly.
7294  *
7295  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
7296  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
7297  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
7298  */
7299 
7300 static VALUE
7302 {
7303  int cr = rb_enc_str_coderange(str);
7304 
7305  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
7306 }
7307 
7308 /*
7309  * call-seq:
7310  * str.ascii_only? -> true or false
7311  *
7312  * Returns true for a string which has only ASCII characters.
7313  *
7314  * "abc".force_encoding("UTF-8").ascii_only? #=> true
7315  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
7316  */
7317 
7318 static VALUE
7320 {
7321  int cr = rb_enc_str_coderange(str);
7322 
7323  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
7324 }
7325 
/*
 * Shortens str to at most len characters, marking the cut with "...".
 *
 * Returns str itself when it already fits within len characters.  When len is
 * no greater than the ellipsis length, or when stepping back over the last
 * three characters fails, the result is the ellipsis alone (cut to len bytes
 * in the first case).  Otherwise the result is the leading part of str
 * followed by "...".  For encodings that are not ASCII-compatible the ellipsis
 * is transcoded to the encoding of str before use.
 *
 * Raises IndexError when len is negative.
 */
7340 VALUE
7341 rb_str_ellipsize(VALUE str, long len)
7342 {
7343  static const char ellipsis[] = "...";
7344  const long ellipsislen = sizeof(ellipsis) - 1;
7345  rb_encoding *const enc = rb_enc_get(str);
7346  const long blen = RSTRING_LEN(str);
7347  const char *const p = RSTRING_PTR(str), *e = p + blen;
7348  VALUE estr, ret = 0;
7349 
7350  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
/* Cheap byte-count test first; otherwise e is moved to the byte position of
 * character index len and compared with the end of the string. */
7351  if (len * rb_enc_mbminlen(enc) >= blen ||
7352  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
7353  ret = str;
7354  }
/* Note: the second operand overwrites len with ellipsislen and moves e back
 * by that many characters; both side effects are relied on below. */
7355  else if (len <= ellipsislen ||
7356  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
7357  if (rb_enc_asciicompat(enc)) {
7358  ret = rb_str_new_with_class(str, ellipsis, len);
7359  rb_enc_associate(ret, enc);
7360  }
7361  else {
7362  estr = rb_usascii_str_new(ellipsis, len);
7363  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
7364  }
7365  }
/* Comma expression: ret is assigned the kept prefix before the encoding test. */
7366  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
7367  rb_str_cat(ret, ellipsis, ellipsislen);
7368  }
7369  else {
7370  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
7371  rb_enc_from_encoding(enc), 0, Qnil);
7372  rb_str_append(ret, estr);
7373  }
7374  return ret;
7375 }
7376 
7377 /**********************************************************************
7378  * Document-class: Symbol
7379  *
7380  * <code>Symbol</code> objects represent names and some strings
7381  * inside the Ruby
7382  * interpreter. They are generated using the <code>:name</code> and
7383  * <code>:"string"</code> literals
7384  * syntax, and by the various <code>to_sym</code> methods. The same
7385  * <code>Symbol</code> object will be created for a given name or string
7386  * for the duration of a program's execution, regardless of the context
7387  * or meaning of that name. Thus if <code>Fred</code> is a constant in
7388  * one context, a method in another, and a class in a third, the
7389  * <code>Symbol</code> <code>:Fred</code> will be the same object in
7390  * all three contexts.
7391  *
7392  * module One
7393  * class Fred
7394  * end
7395  * $f1 = :Fred
7396  * end
7397  * module Two
7398  * Fred = 1
7399  * $f2 = :Fred
7400  * end
7401  * def Fred()
7402  * end
7403  * $f3 = :Fred
7404  * $f1.object_id #=> 2514190
7405  * $f2.object_id #=> 2514190
7406  * $f3.object_id #=> 2514190
7407  *
7408  */
7409 
7410 
7411 /*
7412  * call-seq:
7413  * sym == obj -> true or false
7414  *
7415  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
7416  * symbol, returns <code>true</code>.
7417  */
7418 
7419 static VALUE
7420 sym_equal(VALUE sym1, VALUE sym2)
7421 {
7422  if (sym1 == sym2) return Qtrue;
7423  return Qfalse;
7424 }
7425 
7426 
7427 static int
7428 sym_printable(const char *s, const char *send, rb_encoding *enc)
7429 {
7430  while (s < send) {
7431  int n;
7432  int c = rb_enc_codepoint_len(s, send, &n, enc);
7433 
7434  if (!rb_enc_isprint(c, enc)) return FALSE;
7435  s += n;
7436  }
7437  return TRUE;
7438 }
7439 
7440 /*
7441  * call-seq:
7442  * sym.inspect -> string
7443  *
7444  * Returns the representation of <i>sym</i> as a symbol literal.
7445  *
7446  * :fred.inspect #=> ":fred"
7447  */
7448 
7449 static VALUE
7451 {
7452  VALUE str;
7453  ID id = SYM2ID(sym);
7454  rb_encoding *enc;
7455  const char *ptr;
7456  long len;
7457  char *dest;
7459 
7460  if (resenc == NULL) resenc = rb_default_external_encoding();
7461  sym = rb_id2str(id);
7462  enc = STR_ENC_GET(sym);
7463  ptr = RSTRING_PTR(sym);
7464  len = RSTRING_LEN(sym);
7465  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
7466  !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
7467  str = rb_str_inspect(sym);
7468  len = RSTRING_LEN(str);
7469  rb_str_resize(str, len + 1);
7470  dest = RSTRING_PTR(str);
7471  memmove(dest + 1, dest, len);
7472  dest[0] = ':';
7473  }
7474  else {
7475  char *dest;
7476  str = rb_enc_str_new(0, len + 1, enc);
7477  dest = RSTRING_PTR(str);
7478  dest[0] = ':';
7479  memcpy(dest + 1, ptr, len);
7480  }
7481  return str;
7482 }
7483 
7484 
7485 /*
7486  * call-seq:
7487  * sym.id2name -> string
7488  * sym.to_s -> string
7489  *
7490  * Returns the name or string corresponding to <i>sym</i>.
7491  *
7492  * :fred.id2name #=> "fred"
7493  */
7494 
7495 
7496 VALUE
7498 {
7499  ID id = SYM2ID(sym);
7500 
7501  return str_new3(rb_cString, rb_id2str(id));
7502 }
7503 
7504 
7505 /*
7506  * call-seq:
7507  * sym.to_sym -> sym
7508  * sym.intern -> sym
7509  *
7510  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
7511  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
7512  * in this case.
7513  */
7514 
7515 static VALUE
7517 {
7518  return sym;
7519 }
7520 
7521 static VALUE
7522 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
7523 {
7524  VALUE obj;
7525 
7526  if (argc < 1) {
7527  rb_raise(rb_eArgError, "no receiver given");
7528  }
7529  obj = argv[0];
7530  return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
7531 }
7532 
7533 /*
7534  * call-seq:
7535  * sym.to_proc
7536  *
7537  * Returns a _Proc_ object which responds to the given method by _sym_.
7538  *
7539  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
7540  */
7541 
7542 static VALUE
7544 {
7545  static VALUE sym_proc_cache = Qfalse;
7546  enum {SYM_PROC_CACHE_SIZE = 67};
7547  VALUE proc;
7548  long id, index;
7549  VALUE *aryp;
7550 
7551  if (!sym_proc_cache) {
7552  sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
7553  rb_gc_register_mark_object(sym_proc_cache);
7554  rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
7555  }
7556 
7557  id = SYM2ID(sym);
7558  index = (id % SYM_PROC_CACHE_SIZE) << 1;
7559 
7560  aryp = RARRAY_PTR(sym_proc_cache);
7561  if (aryp[index] == sym) {
7562  return aryp[index + 1];
7563  }
7564  else {
7565  proc = rb_proc_new(sym_call, (VALUE)id);
7566  aryp[index] = sym;
7567  aryp[index + 1] = proc;
7568  return proc;
7569  }
7570 }
7571 
7572 /*
7573  * call-seq:
7574  *
7575  * sym.succ
7576  *
7577  * Same as <code>sym.to_s.succ.intern</code>.
7578  */
7579 
7580 static VALUE
7582 {
7583  return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
7584 }
7585 
7586 /*
7587  * call-seq:
7588  *
7589  * sym <=> other -> -1, 0, +1 or nil
7590  *
7591  * Compares _sym_ with _other_ in string form.
7592  */
7593 
7594 static VALUE
7596 {
7597  if (!SYMBOL_P(other)) {
7598  return Qnil;
7599  }
7600  return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
7601 }
7602 
7603 /*
7604  * call-seq:
7605  *
7606  * sym.casecmp(other) -> -1, 0, +1 or nil
7607  *
7608  * Case-insensitive version of <code>Symbol#<=></code>.
7609  */
7610 
7611 static VALUE
7613 {
7614  if (!SYMBOL_P(other)) {
7615  return Qnil;
7616  }
7617  return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
7618 }
7619 
7620 /*
7621  * call-seq:
7622  * sym =~ obj -> fixnum or nil
7623  *
7624  * Returns <code>sym.to_s =~ obj</code>.
7625  */
7626 
7627 static VALUE
7629 {
7630  return rb_str_match(rb_sym_to_s(sym), other);
7631 }
7632 
7633 /*
7634  * call-seq:
7635  * sym[idx] -> char
7636  * sym[b, n] -> string
7637  *
7638  * Returns <code>sym.to_s[]</code>.
7639  */
7640 
7641 static VALUE
7643 {
7644  return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
7645 }
7646 
7647 /*
7648  * call-seq:
7649  * sym.length -> integer
7650  *
7651  * Same as <code>sym.to_s.length</code>.
7652  */
7653 
7654 static VALUE
7656 {
7657  return rb_str_length(rb_id2str(SYM2ID(sym)));
7658 }
7659 
7660 /*
7661  * call-seq:
7662  * sym.empty? -> true or false
7663  *
7664  * Returns whether _sym_ is :"" or not.
7665  */
7666 
7667 static VALUE
7669 {
7670  return rb_str_empty(rb_id2str(SYM2ID(sym)));
7671 }
7672 
7673 /*
7674  * call-seq:
7675  * sym.upcase -> symbol
7676  *
7677  * Same as <code>sym.to_s.upcase.intern</code>.
7678  */
7679 
7680 static VALUE
7682 {
7684 }
7685 
7686 /*
7687  * call-seq:
7688  * sym.downcase -> symbol
7689  *
7690  * Same as <code>sym.to_s.downcase.intern</code>.
7691  */
7692 
7693 static VALUE
7695 {
7697 }
7698 
7699 /*
7700  * call-seq:
7701  * sym.capitalize -> symbol
7702  *
7703  * Same as <code>sym.to_s.capitalize.intern</code>.
7704  */
7705 
7706 static VALUE
7708 {
7710 }
7711 
7712 /*
7713  * call-seq:
7714  * sym.swapcase -> symbol
7715  *
7716  * Same as <code>sym.to_s.swapcase.intern</code>.
7717  */
7718 
7719 static VALUE
7721 {
7723 }
7724 
7725 /*
7726  * call-seq:
7727  * sym.encoding -> encoding
7728  *
7729  * Returns the Encoding object that represents the encoding of _sym_.
7730  */
7731 
7732 static VALUE
7734 {
7735  return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
7736 }
7737 
7738 ID
7740 {
7741  VALUE tmp;
7742 
7743  switch (TYPE(name)) {
7744  default:
7745  tmp = rb_check_string_type(name);
7746  if (NIL_P(tmp)) {
7747  tmp = rb_inspect(name);
7748  rb_raise(rb_eTypeError, "%s is not a symbol",
7749  RSTRING_PTR(tmp));
7750  }
7751  name = tmp;
7752  /* fall through */
7753  case T_STRING:
7754  name = rb_str_intern(name);
7755  /* fall through */
7756  case T_SYMBOL:
7757  return SYM2ID(name);
7758  }
7759  return Qnil; /* not reached */
7760 }
7761 
7762 /*
7763  * A <code>String</code> object holds and manipulates an arbitrary sequence of
7764  * bytes, typically representing characters. String objects may be created
7765  * using <code>String::new</code> or as literals.
7766  *
7767  * Because of aliasing issues, users of strings should be aware of the methods
7768  * that modify the contents of a <code>String</code> object. Typically,
7769  * methods with names ending in ``!'' modify their receiver, while those
7770  * without a ``!'' return a new <code>String</code>. However, there are
7771  * exceptions, such as <code>String#[]=</code>.
7772  *
7773  */
7774 
7775 void
7777 {
 /*
  * Defines the String and Symbol classes and registers their methods.
  *
  * NOTE(review): the declarator line naming this function is absent from
  * this listing, and the embedded line numbering has gaps (e.g. 7781 is
  * followed by 7785), so some registrations appear to be missing here --
  * confirm against the upstream file before relying on this list.
  */
 /* Within this function rb_intern expands to rb_intern_const. */
7778 #undef rb_intern
7779 #define rb_intern(str) rb_intern_const(str)
7780 
 /* ---- String: class definition and instance methods ---- */
7781  rb_cString = rb_define_class("String", rb_cObject);
7785  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
7786  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
7790  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
7792  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
7798  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
7799  rb_define_method(rb_cString, "length", rb_str_length, 0);
7801  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
7802  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
7809  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
7812  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
7815  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
7816  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
7817  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
7818 
7819  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
7822  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
7823  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
7825 
7826  rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
7827  rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
7828  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
7829  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
7830 
7835 
7843  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
7845  rb_define_method(rb_cString, "concat", rb_str_concat, 1);
7847  rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
 /* "intern" and "to_sym" share one implementation. */
7849  rb_define_method(rb_cString, "intern", rb_str_intern, 0);
7850  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
7852 
7853  rb_define_method(rb_cString, "include?", rb_str_include, 1);
7854  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
7855  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
7856 
7858 
7859  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
7860  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
7861  rb_define_method(rb_cString, "center", rb_str_center, -1);
7862 
7863  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
7864  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
7866  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
7868  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
7869  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
7870 
7878 
7881  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
7882  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
7883  rb_define_method(rb_cString, "count", rb_str_count, -1);
7884 
7889 
7890  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
7891  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
7892  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
7893  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
7894 
7895  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
7896 
7897  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
7899 
7900  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
7901  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
7902 
7903  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
7904  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
7905  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
7907 
 /* Cache the ID for "to_s" in the file-level id_to_s variable. */
7908  id_to_s = rb_intern("to_s");
7909 
 /* rb_fs starts as nil and is exposed under both "$;" and "$-F". */
7910  rb_fs = Qnil;
7911  rb_define_variable("$;", &rb_fs);
7912  rb_define_variable("$-F", &rb_fs);
7913 
 /* ---- Symbol: class definition and methods ---- */
7914  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
7918  rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
7919 
7922  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
7924  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
7925  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
7926  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
7927  rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
 /* "succ" and "next" share one implementation. */
7928  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
7929  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
7930 
7931  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
7932  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
7934 
 /* "[]"/"slice" and "length"/"size" are alias pairs. */
7935  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
7936  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
7937  rb_define_method(rb_cSymbol, "length", sym_length, 0);
7938  rb_define_method(rb_cSymbol, "size", sym_length, 0);
7939  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
7940  rb_define_method(rb_cSymbol, "match", sym_match, 1);
7941 
7942  rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
7943  rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
7944  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
7945  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
7946 
7947  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
7948 }
7949