Ruby 1.9.3p551 (2014-11-13 revision 48407)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author: usa $
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/ruby.h"
15 #include "ruby/re.h"
16 #include "ruby/encoding.h"
17 #include "internal.h"
18 #include <assert.h>
19 
20 #define BEG(no) (regs->beg[(no)])
21 #define END(no) (regs->end[(no)])
22 
23 #include <math.h>
24 #include <ctype.h>
25 
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 
30 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
31 
32 #undef rb_str_new_cstr
33 #undef rb_tainted_str_new_cstr
34 #undef rb_usascii_str_new_cstr
35 #undef rb_external_str_new_cstr
36 #undef rb_locale_str_new_cstr
37 #undef rb_str_new2
38 #undef rb_str_new3
39 #undef rb_str_new4
40 #undef rb_str_new5
41 #undef rb_tainted_str_new2
42 #undef rb_usascii_str_new2
43 #undef rb_str_dup_frozen
44 #undef rb_str_buf_new_cstr
45 #undef rb_str_buf_new2
46 #undef rb_str_buf_cat2
47 #undef rb_str_cat2
48 
49 static VALUE rb_str_clear(VALUE str);
50 
53 
54 #define RUBY_MAX_CHAR_LEN 16
55 #define STR_TMPLOCK FL_USER7
56 #define STR_NOEMBED FL_USER1
57 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */
58 #define STR_ASSOC FL_USER3
59 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
60 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
61 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
62 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
63 #define STR_UNSET_NOCAPA(s) do {\
64  if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
65 } while (0)
66 
67 
68 #define STR_SET_NOEMBED(str) do {\
69  FL_SET((str), STR_NOEMBED);\
70  STR_SET_EMBED_LEN((str), 0);\
71 } while (0)
72 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
73 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
74 #define STR_SET_EMBED_LEN(str, n) do { \
75  long tmp_n = (n);\
76  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
77  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
78 } while (0)
79 
80 #define STR_SET_LEN(str, n) do { \
81  if (STR_EMBED_P(str)) {\
82  STR_SET_EMBED_LEN((str), (n));\
83  }\
84  else {\
85  RSTRING(str)->as.heap.len = (n);\
86  }\
87 } while (0)
88 
89 #define STR_DEC_LEN(str) do {\
90  if (STR_EMBED_P(str)) {\
91  long n = RSTRING_LEN(str);\
92  n--;\
93  STR_SET_EMBED_LEN((str), n);\
94  }\
95  else {\
96  RSTRING(str)->as.heap.len--;\
97  }\
98 } while (0)
99 
100 #define RESIZE_CAPA(str,capacity) do {\
101  if (STR_EMBED_P(str)) {\
102  if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
103  char *tmp = ALLOC_N(char, (capacity)+1);\
104  memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
105  RSTRING(str)->as.heap.ptr = tmp;\
106  RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
107  STR_SET_NOEMBED(str);\
108  RSTRING(str)->as.heap.aux.capa = (capacity);\
109  }\
110  }\
111  else {\
112  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
113  if (!STR_NOCAPA_P(str))\
114  RSTRING(str)->as.heap.aux.capa = (capacity);\
115  }\
116 } while (0)
117 
118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
120 
121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
122 
123 static inline int
125 {
126  rb_encoding *enc;
127 
128  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
129  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
130  return 1;
131 
132  enc = STR_ENC_GET(str);
133  if (rb_enc_mbmaxlen(enc) == 1)
134  return 1;
135 
136  /* Conservative. Possibly single byte.
137  * "\xa1" in Shift_JIS for example. */
138  return 0;
139 }
140 
142 
/*
 * Return a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when every byte in the range passes ISASCII().
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two VALUEs,
 * the aligned middle of the range is tested one VALUE-sized word at a
 * time against NONASCII_MASK (0x80 replicated in every byte); the
 * unaligned head and the remaining tail are tested byte by byte.
 *
 * NONASCII_MASK is defined here and is also tested by code further down
 * in this file.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* p rounded up, and e rounded down, to a VALUE boundary */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        /* unaligned head: byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned middle: stop at the first word holding a high-bit byte */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* the byte loop below pinpoints the byte (or scans the tail) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
179 
180 static int
181 coderange_scan(const char *p, long len, rb_encoding *enc)
182 {
183  const char *e = p + len;
184 
185  if (rb_enc_to_index(enc) == 0) {
186  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
187  p = search_nonascii(p, e);
189  }
190 
191  if (rb_enc_asciicompat(enc)) {
192  p = search_nonascii(p, e);
193  if (!p) {
194  return ENC_CODERANGE_7BIT;
195  }
196  while (p < e) {
197  int ret = rb_enc_precise_mbclen(p, e, enc);
198  if (!MBCLEN_CHARFOUND_P(ret)) {
199  return ENC_CODERANGE_BROKEN;
200  }
201  p += MBCLEN_CHARFOUND_LEN(ret);
202  if (p < e) {
203  p = search_nonascii(p, e);
204  if (!p) {
205  return ENC_CODERANGE_VALID;
206  }
207  }
208  }
209  if (e < p) {
210  return ENC_CODERANGE_BROKEN;
211  }
212  return ENC_CODERANGE_VALID;
213  }
214 
215  while (p < e) {
216  int ret = rb_enc_precise_mbclen(p, e, enc);
217 
218  if (!MBCLEN_CHARFOUND_P(ret)) {
219  return ENC_CODERANGE_BROKEN;
220  }
221  p += MBCLEN_CHARFOUND_LEN(ret);
222  }
223  if (e < p) {
224  return ENC_CODERANGE_BROKEN;
225  }
226  return ENC_CODERANGE_VALID;
227 }
228 
229 long
230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
231 {
232  const char *p = s;
233 
234  if (*cr == ENC_CODERANGE_BROKEN)
235  return e - s;
236 
237  if (rb_enc_to_index(enc) == 0) {
238  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
239  p = search_nonascii(p, e);
241  return e - s;
242  }
243  else if (rb_enc_asciicompat(enc)) {
244  p = search_nonascii(p, e);
245  if (!p) {
246  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
247  return e - s;
248  }
249  while (p < e) {
250  int ret = rb_enc_precise_mbclen(p, e, enc);
251  if (!MBCLEN_CHARFOUND_P(ret)) {
253  return p - s;
254  }
255  p += MBCLEN_CHARFOUND_LEN(ret);
256  if (p < e) {
257  p = search_nonascii(p, e);
258  if (!p) {
259  *cr = ENC_CODERANGE_VALID;
260  return e - s;
261  }
262  }
263  }
265  return p - s;
266  }
267  else {
268  while (p < e) {
269  int ret = rb_enc_precise_mbclen(p, e, enc);
270  if (!MBCLEN_CHARFOUND_P(ret)) {
272  return p - s;
273  }
274  p += MBCLEN_CHARFOUND_LEN(ret);
275  }
277  return p - s;
278  }
279 }
280 
281 static inline void
283 {
284  rb_enc_set_index(str1, ENCODING_GET(str2));
285 }
286 
287 static void
289 {
290  /* this function is designed for copying encoding and coderange
291  * from src to new string "dest" which is made from the part of src.
292  */
293  str_enc_copy(dest, src);
294  switch (ENC_CODERANGE(src)) {
295  case ENC_CODERANGE_7BIT:
297  break;
298  case ENC_CODERANGE_VALID:
299  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
302  else
304  break;
305  default:
306  if (RSTRING_LEN(dest) == 0) {
307  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
309  else
311  }
312  break;
313  }
314 }
315 
316 static void
318 {
319  str_enc_copy(dest, src);
320  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
321 }
322 
323 int
325 {
326  int cr = ENC_CODERANGE(str);
327 
328  if (cr == ENC_CODERANGE_UNKNOWN) {
329  rb_encoding *enc = STR_ENC_GET(str);
330  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
331  ENC_CODERANGE_SET(str, cr);
332  }
333  return cr;
334 }
335 
336 int
338 {
339  rb_encoding *enc = STR_ENC_GET(str);
340 
341  if (!rb_enc_asciicompat(enc))
342  return FALSE;
343  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
344  return TRUE;
345  return FALSE;
346 }
347 
348 static inline void
349 str_mod_check(VALUE s, const char *p, long len)
350 {
351  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
352  rb_raise(rb_eRuntimeError, "string modified");
353  }
354 }
355 
356 size_t
358 {
359  if (STR_EMBED_P(str)) {
360  return RSTRING_EMBED_LEN_MAX;
361  }
362  else if (STR_NOCAPA_P(str)) {
363  return RSTRING(str)->as.heap.len;
364  }
365  else {
366  return RSTRING(str)->as.heap.aux.capa;
367  }
368 }
369 
370 static inline VALUE
372 {
373  NEWOBJ(str, struct RString);
374  OBJSETUP(str, klass, T_STRING);
375 
376  str->as.heap.ptr = 0;
377  str->as.heap.len = 0;
378  str->as.heap.aux.capa = 0;
379 
380  return (VALUE)str;
381 }
382 
383 static VALUE
384 str_new(VALUE klass, const char *ptr, long len)
385 {
386  VALUE str;
387 
388  if (len < 0) {
389  rb_raise(rb_eArgError, "negative string size (or size too big)");
390  }
391 
392  str = str_alloc(klass);
393  if (len > RSTRING_EMBED_LEN_MAX) {
394  RSTRING(str)->as.heap.aux.capa = len;
395  RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
396  STR_SET_NOEMBED(str);
397  }
398  else if (len == 0) {
400  }
401  if (ptr) {
402  memcpy(RSTRING_PTR(str), ptr, len);
403  }
404  STR_SET_LEN(str, len);
405  RSTRING_PTR(str)[len] = '\0';
406  return str;
407 }
408 
409 VALUE
410 rb_str_new(const char *ptr, long len)
411 {
412  return str_new(rb_cString, ptr, len);
413 }
414 
415 VALUE
416 rb_usascii_str_new(const char *ptr, long len)
417 {
418  VALUE str = rb_str_new(ptr, len);
420  return str;
421 }
422 
423 VALUE
424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
425 {
426  VALUE str = rb_str_new(ptr, len);
427  rb_enc_associate(str, enc);
428  return str;
429 }
430 
431 VALUE
432 rb_str_new_cstr(const char *ptr)
433 {
434  if (!ptr) {
435  rb_raise(rb_eArgError, "NULL pointer given");
436  }
437  return rb_str_new(ptr, strlen(ptr));
438 }
439 
441 #define rb_str_new2 rb_str_new_cstr
442 
443 VALUE
444 rb_usascii_str_new_cstr(const char *ptr)
445 {
446  VALUE str = rb_str_new2(ptr);
448  return str;
449 }
450 
452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
453 
454 VALUE
455 rb_tainted_str_new(const char *ptr, long len)
456 {
457  VALUE str = rb_str_new(ptr, len);
458 
459  OBJ_TAINT(str);
460  return str;
461 }
462 
463 VALUE
464 rb_tainted_str_new_cstr(const char *ptr)
465 {
466  VALUE str = rb_str_new2(ptr);
467 
468  OBJ_TAINT(str);
469  return str;
470 }
471 
473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
474 
475 VALUE
476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
477 {
478  rb_econv_t *ec;
479  rb_econv_result_t ret;
480  long len;
481  VALUE newstr;
482  const unsigned char *sp;
483  unsigned char *dp;
484 
485  if (!to) return str;
486  if (from == to) return str;
487  if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
488  to == rb_ascii8bit_encoding()) {
489  if (STR_ENC_GET(str) != to) {
490  str = rb_str_dup(str);
491  rb_enc_associate(str, to);
492  }
493  return str;
494  }
495 
496  len = RSTRING_LEN(str);
497  newstr = rb_str_new(0, len);
498 
499  retry:
500  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
501  if (!ec) return str;
502 
503  sp = (unsigned char*)RSTRING_PTR(str);
504  dp = (unsigned char*)RSTRING_PTR(newstr);
505  ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
506  &dp, (unsigned char*)RSTRING_END(newstr), 0);
507  rb_econv_close(ec);
508  switch (ret) {
510  /* destination buffer short */
511  len = len < 2 ? 2 : len * 2;
512  rb_str_resize(newstr, len);
513  goto retry;
514 
515  case econv_finished:
516  len = dp - (unsigned char*)RSTRING_PTR(newstr);
517  rb_str_set_len(newstr, len);
518  rb_enc_associate(newstr, to);
519  return newstr;
520 
521  default:
522  /* some error, return original */
523  return str;
524  }
525 }
526 
527 VALUE
529 {
530  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
531 }
532 
533 VALUE
534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
535 {
536  VALUE str;
537 
538  str = rb_tainted_str_new(ptr, len);
539  if (eenc == rb_usascii_encoding() &&
542  return str;
543  }
544  rb_enc_associate(str, eenc);
545  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
546 }
547 
548 VALUE
549 rb_external_str_new(const char *ptr, long len)
550 {
552 }
553 
554 VALUE
555 rb_external_str_new_cstr(const char *ptr)
556 {
558 }
559 
560 VALUE
561 rb_locale_str_new(const char *ptr, long len)
562 {
564 }
565 
566 VALUE
567 rb_locale_str_new_cstr(const char *ptr)
568 {
570 }
571 
572 VALUE
573 rb_filesystem_str_new(const char *ptr, long len)
574 {
576 }
577 
578 VALUE
580 {
582 }
583 
584 VALUE
586 {
588 }
589 
590 VALUE
592 {
593  return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
594 }
595 
596 VALUE
598 {
599  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
600 }
601 
602 static VALUE
604 {
605  if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
606  STR_SET_EMBED(str2);
607  memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
608  STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
609  }
610  else {
611  str = rb_str_new_frozen(str);
612  FL_SET(str2, STR_NOEMBED);
613  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
614  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
615  RSTRING(str2)->as.heap.aux.shared = str;
616  FL_SET(str2, ELTS_SHARED);
617  }
618  rb_enc_cr_str_exact_copy(str2, str);
619 
620  return str2;
621 }
622 
623 static VALUE
625 {
626  return str_replace_shared(str_alloc(klass), str);
627 }
628 
629 static VALUE
630 str_new3(VALUE klass, VALUE str)
631 {
632  return str_new_shared(klass, str);
633 }
634 
635 VALUE
637 {
638  VALUE str2 = str_new3(rb_obj_class(str), str);
639 
640  OBJ_INFECT(str2, str);
641  return str2;
642 }
643 
645 #define rb_str_new3 rb_str_new_shared
646 
647 static VALUE
648 str_new4(VALUE klass, VALUE str)
649 {
650  VALUE str2;
651 
652  str2 = str_alloc(klass);
653  STR_SET_NOEMBED(str2);
654  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
655  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
656  if (STR_SHARED_P(str)) {
657  VALUE shared = RSTRING(str)->as.heap.aux.shared;
658  assert(OBJ_FROZEN(shared));
659  FL_SET(str2, ELTS_SHARED);
660  RSTRING(str2)->as.heap.aux.shared = shared;
661  }
662  else {
663  FL_SET(str, ELTS_SHARED);
664  RSTRING(str)->as.heap.aux.shared = str2;
665  }
666  rb_enc_cr_str_exact_copy(str2, str);
667  OBJ_INFECT(str2, str);
668  return str2;
669 }
670 
671 VALUE
673 {
674  VALUE klass, str;
675 
676  if (OBJ_FROZEN(orig)) return orig;
677  klass = rb_obj_class(orig);
678  if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
679  long ofs;
680  assert(OBJ_FROZEN(str));
681  ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
682  if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
683  (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
684  ENCODING_GET(str) != ENCODING_GET(orig)) {
685  str = str_new3(klass, str);
686  RSTRING(str)->as.heap.ptr += ofs;
687  RSTRING(str)->as.heap.len -= ofs;
688  rb_enc_cr_str_exact_copy(str, orig);
689  OBJ_INFECT(str, orig);
690  }
691  }
692  else if (STR_EMBED_P(orig)) {
693  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
694  rb_enc_cr_str_exact_copy(str, orig);
695  OBJ_INFECT(str, orig);
696  }
697  else if (STR_ASSOC_P(orig)) {
698  VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
699  FL_UNSET(orig, STR_ASSOC);
700  str = str_new4(klass, orig);
701  FL_SET(str, STR_ASSOC);
702  RSTRING(str)->as.heap.aux.shared = assoc;
703  }
704  else {
705  str = str_new4(klass, orig);
706  }
707  OBJ_FREEZE(str);
708  return str;
709 }
710 
712 #define rb_str_new4 rb_str_new_frozen
713 
714 VALUE
715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
716 {
717  return str_new(rb_obj_class(obj), ptr, len);
718 }
719 
720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
721  rb_str_new_with_class, (obj, ptr, len))
722 #define rb_str_new5 rb_str_new_with_class
723 
724 static VALUE
725 str_new_empty(VALUE str)
726 {
727  VALUE v = rb_str_new5(str, 0, 0);
728  rb_enc_copy(v, str);
729  OBJ_INFECT(v, str);
730  return v;
731 }
732 
733 #define STR_BUF_MIN_SIZE 128
734 
735 VALUE
736 rb_str_buf_new(long capa)
737 {
738  VALUE str = str_alloc(rb_cString);
739 
740  if (capa < STR_BUF_MIN_SIZE) {
741  capa = STR_BUF_MIN_SIZE;
742  }
743  FL_SET(str, STR_NOEMBED);
744  RSTRING(str)->as.heap.aux.capa = capa;
745  RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
746  RSTRING(str)->as.heap.ptr[0] = '\0';
747 
748  return str;
749 }
750 
751 VALUE
752 rb_str_buf_new_cstr(const char *ptr)
753 {
754  VALUE str;
755  long len = strlen(ptr);
756 
757  str = rb_str_buf_new(len);
758  rb_str_buf_cat(str, ptr, len);
759 
760  return str;
761 }
762 
764 #define rb_str_buf_new2 rb_str_buf_new_cstr
765 
766 VALUE
767 rb_str_tmp_new(long len)
768 {
769  return str_new(0, 0, len);
770 }
771 
772 void *
773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
774 {
775  VALUE s = rb_str_tmp_new(len);
776  *store = s;
777  return RSTRING_PTR(s);
778 }
779 
780 void
781 rb_free_tmp_buffer(volatile VALUE *store)
782 {
783  VALUE s = *store;
784  *store = 0;
785  if (s) rb_str_clear(s);
786 }
787 
788 void
790 {
791  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
792  xfree(RSTRING(str)->as.heap.ptr);
793  }
794 }
795 
796 RUBY_FUNC_EXPORTED size_t
798 {
799  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
800  return RSTRING(str)->as.heap.aux.capa;
801  }
802  else {
803  return 0;
804  }
805 }
806 
807 VALUE
809 {
810  return rb_convert_type(str, T_STRING, "String", "to_str");
811 }
812 
813 static inline void str_discard(VALUE str);
814 
815 void
817 {
818  rb_encoding *enc;
819  int cr;
820  if (str == str2) return;
821  enc = STR_ENC_GET(str2);
822  cr = ENC_CODERANGE(str2);
823  str_discard(str);
824  OBJ_INFECT(str, str2);
825  if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
826  STR_SET_EMBED(str);
827  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
828  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
829  rb_enc_associate(str, enc);
830  ENC_CODERANGE_SET(str, cr);
831  return;
832  }
833  STR_SET_NOEMBED(str);
834  STR_UNSET_NOCAPA(str);
835  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
836  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
837  if (STR_NOCAPA_P(str2)) {
838  FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
839  RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
840  }
841  else {
842  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
843  }
844  STR_SET_EMBED(str2); /* abandon str2 */
845  RSTRING_PTR(str2)[0] = 0;
846  STR_SET_EMBED_LEN(str2, 0);
847  rb_enc_associate(str, enc);
848  ENC_CODERANGE_SET(str, cr);
849 }
850 
851 static ID id_to_s;
852 
853 VALUE
855 {
856  VALUE str;
857 
858  if (TYPE(obj) == T_STRING) {
859  return obj;
860  }
861  str = rb_funcall(obj, id_to_s, 0);
862  if (TYPE(str) != T_STRING)
863  return rb_any_to_s(obj);
864  if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
865  return str;
866 }
867 
868 static VALUE
870 {
871  long len;
872 
873  len = RSTRING_LEN(str2);
874  if (STR_ASSOC_P(str2)) {
875  str2 = rb_str_new4(str2);
876  }
877  if (STR_SHARED_P(str2)) {
878  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
879  assert(OBJ_FROZEN(shared));
880  STR_SET_NOEMBED(str);
881  RSTRING(str)->as.heap.len = len;
882  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
883  FL_SET(str, ELTS_SHARED);
884  FL_UNSET(str, STR_ASSOC);
885  RSTRING(str)->as.heap.aux.shared = shared;
886  }
887  else {
888  str_replace_shared(str, str2);
889  }
890 
891  OBJ_INFECT(str, str2);
892  rb_enc_cr_str_exact_copy(str, str2);
893  return str;
894 }
895 
896 static VALUE
898 {
899  VALUE dup = str_alloc(klass);
900  str_replace(dup, str);
901  return dup;
902 }
903 
904 VALUE
906 {
907  return str_duplicate(rb_obj_class(str), str);
908 }
909 
910 VALUE
912 {
913  return str_replace(str_alloc(rb_cString), str);
914 }
915 
916 /*
917  * call-seq:
918  * String.new(str="") -> new_str
919  *
920  * Returns a new string object containing a copy of <i>str</i>.
921  */
922 
923 static VALUE
925 {
926  VALUE orig;
927 
928  if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
929  rb_str_replace(str, orig);
930  return str;
931 }
932 
933 static inline long
934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
935 {
936  long c;
937  const char *q;
938 
939  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
940  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
941  }
942  else if (rb_enc_asciicompat(enc)) {
943  c = 0;
944  if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
945  while (p < e) {
946  if (ISASCII(*p)) {
947  q = search_nonascii(p, e);
948  if (!q)
949  return c + (e - p);
950  c += q - p;
951  p = q;
952  }
953  p += rb_enc_fast_mbclen(p, e, enc);
954  c++;
955  }
956  }
957  else {
958  while (p < e) {
959  if (ISASCII(*p)) {
960  q = search_nonascii(p, e);
961  if (!q)
962  return c + (e - p);
963  c += q - p;
964  p = q;
965  }
966  p += rb_enc_mbclen(p, e, enc);
967  c++;
968  }
969  }
970  return c;
971  }
972 
973  for (c=0; p<e; c++) {
974  p += rb_enc_mbclen(p, e, enc);
975  }
976  return c;
977 }
978 
979 long
980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
981 {
982  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
983 }
984 
985 long
986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
987 {
988  long c;
989  const char *q;
990  int ret;
991 
992  *cr = 0;
993  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
994  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
995  }
996  else if (rb_enc_asciicompat(enc)) {
997  c = 0;
998  while (p < e) {
999  if (ISASCII(*p)) {
1000  q = search_nonascii(p, e);
1001  if (!q) {
1002  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1003  return c + (e - p);
1004  }
1005  c += q - p;
1006  p = q;
1007  }
1008  ret = rb_enc_precise_mbclen(p, e, enc);
1009  if (MBCLEN_CHARFOUND_P(ret)) {
1010  *cr |= ENC_CODERANGE_VALID;
1011  p += MBCLEN_CHARFOUND_LEN(ret);
1012  }
1013  else {
1014  *cr = ENC_CODERANGE_BROKEN;
1015  p++;
1016  }
1017  c++;
1018  }
1019  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1020  return c;
1021  }
1022 
1023  for (c=0; p<e; c++) {
1024  ret = rb_enc_precise_mbclen(p, e, enc);
1025  if (MBCLEN_CHARFOUND_P(ret)) {
1026  *cr |= ENC_CODERANGE_VALID;
1027  p += MBCLEN_CHARFOUND_LEN(ret);
1028  }
1029  else {
1030  *cr = ENC_CODERANGE_BROKEN;
1031  if (p + rb_enc_mbminlen(enc) <= e)
1032  p += rb_enc_mbminlen(enc);
1033  else
1034  p = e;
1035  }
1036  }
1037  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1038  return c;
1039 }
1040 
1041 #ifdef NONASCII_MASK
1042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1043 
1044 /*
1045  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1046  * bit represention. (see http://en.wikipedia.org/wiki/UTF-8)
1047  * Therefore, following pseudo code can detect UTF-8 leading byte.
1048  *
1049  * if (!(byte & 0x80))
1050  * byte |= 0x40; // turn on bit6
1051  * return ((byte>>6) & 1); // bit6 represent it's leading byte or not.
1052  *
1053  * This function calculate every bytes in the argument word `s'
1054  * using the above logic concurrently. and gather every bytes result.
1055  */
1056 static inline VALUE
1057 count_utf8_lead_bytes_with_word(const VALUE *s)
1058 {
1059  VALUE d = *s;
1060 
1061  /* Transform into bit0 represent UTF-8 leading or not. */
1062  d |= ~(d>>1);
1063  d >>= 6;
1064  d &= NONASCII_MASK >> 7;
1065 
1066  /* Gather every bytes. */
1067  d += (d>>8);
1068  d += (d>>16);
1069 #if SIZEOF_VALUE == 8
1070  d += (d>>32);
1071 #endif
1072  return (d&0xF);
1073 }
1074 #endif
1075 
1076 static long
1078 {
1079  const char *p, *e;
1080  long n;
1081  int cr;
1082 
1083  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1084  if (!enc) enc = STR_ENC_GET(str);
1085  p = RSTRING_PTR(str);
1086  e = RSTRING_END(str);
1087  cr = ENC_CODERANGE(str);
1088 #ifdef NONASCII_MASK
1089  if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1090  enc == rb_utf8_encoding()) {
1091 
1092  VALUE len = 0;
1093  if ((int)sizeof(VALUE) * 2 < e - p) {
1094  const VALUE *s, *t;
1095  const VALUE lowbits = sizeof(VALUE) - 1;
1096  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1097  t = (const VALUE*)(~lowbits & (VALUE)e);
1098  while (p < (const char *)s) {
1099  if (is_utf8_lead_byte(*p)) len++;
1100  p++;
1101  }
1102  while (s < t) {
1103  len += count_utf8_lead_bytes_with_word(s);
1104  s++;
1105  }
1106  p = (const char *)s;
1107  }
1108  while (p < e) {
1109  if (is_utf8_lead_byte(*p)) len++;
1110  p++;
1111  }
1112  return (long)len;
1113  }
1114 #endif
1115  n = rb_enc_strlen_cr(p, e, enc, &cr);
1116  if (cr) {
1117  ENC_CODERANGE_SET(str, cr);
1118  }
1119  return n;
1120 }
1121 
1122 long
1124 {
1125  return str_strlen(str, STR_ENC_GET(str));
1126 }
1127 
1128 /*
1129  * call-seq:
1130  * str.length -> integer
1131  * str.size -> integer
1132  *
1133  * Returns the character length of <i>str</i>.
1134  */
1135 
1136 VALUE
1138 {
1139  long len;
1140 
1141  len = str_strlen(str, STR_ENC_GET(str));
1142  return LONG2NUM(len);
1143 }
1144 
1145 /*
1146  * call-seq:
1147  * str.bytesize -> integer
1148  *
1149  * Returns the length of <i>str</i> in bytes.
1150  */
1151 
1152 static VALUE
1154 {
1155  return LONG2NUM(RSTRING_LEN(str));
1156 }
1157 
1158 /*
1159  * call-seq:
1160  * str.empty? -> true or false
1161  *
1162  * Returns <code>true</code> if <i>str</i> has a length of zero.
1163  *
1164  * "hello".empty? #=> false
1165  * "".empty? #=> true
1166  */
1167 
1168 static VALUE
1170 {
1171  if (RSTRING_LEN(str) == 0)
1172  return Qtrue;
1173  return Qfalse;
1174 }
1175 
1176 /*
1177  * call-seq:
1178  * str + other_str -> new_str
1179  *
1180  * Concatenation---Returns a new <code>String</code> containing
1181  * <i>other_str</i> concatenated to <i>str</i>.
1182  *
1183  * "Hello from " + self.to_s #=> "Hello from main"
1184  */
1185 
1186 VALUE
1188 {
1189  VALUE str3;
1190  rb_encoding *enc;
1191 
1192  StringValue(str2);
1193  enc = rb_enc_check(str1, str2);
1194  str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1195  memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1196  memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1197  RSTRING_PTR(str2), RSTRING_LEN(str2));
1198  RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1199 
1200  if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1201  OBJ_TAINT(str3);
1204  return str3;
1205 }
1206 
1207 /*
1208  * call-seq:
1209  * str * integer -> new_str
1210  *
1211  * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
1212  * the receiver.
1213  *
1214  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1215  */
1216 
1217 VALUE
1219 {
1220  VALUE str2;
1221  long n, len;
1222  char *ptr2;
1223 
1224  len = NUM2LONG(times);
1225  if (len < 0) {
1226  rb_raise(rb_eArgError, "negative argument");
1227  }
1228  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1229  rb_raise(rb_eArgError, "argument too big");
1230  }
1231 
1232  str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1233  ptr2 = RSTRING_PTR(str2);
1234  if (len) {
1235  n = RSTRING_LEN(str);
1236  memcpy(ptr2, RSTRING_PTR(str), n);
1237  while (n <= len/2) {
1238  memcpy(ptr2 + n, ptr2, n);
1239  n *= 2;
1240  }
1241  memcpy(ptr2 + n, ptr2, len-n);
1242  }
1243  ptr2[RSTRING_LEN(str2)] = '\0';
1244  OBJ_INFECT(str2, str);
1245  rb_enc_cr_str_copy_for_substr(str2, str);
1246 
1247  return str2;
1248 }
1249 
1250 /*
1251  * call-seq:
1252  * str % arg -> new_str
1253  *
1254  * Format---Uses <i>str</i> as a format specification, and returns the result
1255  * of applying it to <i>arg</i>. If the format specification contains more than
1256  * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1257  * containing the values to be substituted. See <code>Kernel::sprintf</code> for
1258  * details of the format string.
1259  *
1260  * "%05d" % 123 #=> "00123"
1261  * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
1262  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
1263  */
1264 
1265 static VALUE
1267 {
1268  volatile VALUE tmp = rb_check_array_type(arg);
1269 
1270  if (!NIL_P(tmp)) {
1271  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1272  }
1273  return rb_str_format(1, &arg, str);
1274 }
1275 
1276 static inline void
1278 {
1279  if (FL_TEST(str, STR_TMPLOCK)) {
1280  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1281  }
1282  rb_check_frozen(str);
1283  if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1284  rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1285 }
1286 
1287 static inline int
1289 {
1290  str_modifiable(str);
1291  if (!STR_SHARED_P(str)) return 1;
1292  if (STR_EMBED_P(str)) return 1;
1293  return 0;
1294 }
1295 
1296 static void
1298 {
1299  char *ptr;
1300  long len = RSTRING_LEN(str);
1301  long capa = len + expand;
1302 
1303  if (len > capa) len = capa;
1304  ptr = ALLOC_N(char, capa + 1);
1305  if (RSTRING_PTR(str)) {
1306  memcpy(ptr, RSTRING_PTR(str), len);
1307  }
1308  STR_SET_NOEMBED(str);
1309  STR_UNSET_NOCAPA(str);
1310  ptr[len] = 0;
1311  RSTRING(str)->as.heap.ptr = ptr;
1312  RSTRING(str)->as.heap.len = len;
1313  RSTRING(str)->as.heap.aux.capa = capa;
1314 }
1315 
1316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1317 
1318 void
1320 {
1321  if (!str_independent(str))
1322  str_make_independent(str);
1323  ENC_CODERANGE_CLEAR(str);
1324 }
1325 
1326 void
1327 rb_str_modify_expand(VALUE str, long expand)
1328 {
1329  if (expand < 0) {
1330  rb_raise(rb_eArgError, "negative expanding string size");
1331  }
1332  if (!str_independent(str)) {
1333  str_make_independent_expand(str, expand);
1334  }
1335  else if (expand > 0) {
1336  long len = RSTRING_LEN(str);
1337  long capa = len + expand;
1338  if (!STR_EMBED_P(str)) {
1339  REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1340  STR_UNSET_NOCAPA(str);
1341  RSTRING(str)->as.heap.aux.capa = capa;
1342  }
1343  else if (capa > RSTRING_EMBED_LEN_MAX) {
1344  str_make_independent_expand(str, expand);
1345  }
1346  }
1347  ENC_CODERANGE_CLEAR(str);
1348 }
1349 
1350 /* As rb_str_modify(), but don't clear coderange */
1351 static void
1353 {
1354  if (!str_independent(str))
1355  str_make_independent(str);
1356  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1357  /* Force re-scan later */
1358  ENC_CODERANGE_CLEAR(str);
1359 }
1360 
1361 static inline void
1363 {
1364  str_modifiable(str);
1365  if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1366  xfree(RSTRING_PTR(str));
1367  RSTRING(str)->as.heap.ptr = 0;
1368  RSTRING(str)->as.heap.len = 0;
1369  }
1370 }
1371 
1372 void
1374 {
1375  /* sanity check */
1376  rb_check_frozen(str);
1377  if (STR_ASSOC_P(str)) {
1378  /* already associated */
1379  rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1380  }
1381  else {
1382  if (STR_SHARED_P(str)) {
1383  VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1384  str_make_independent(str);
1385  if (STR_ASSOC_P(assoc)) {
1386  assoc = RSTRING(assoc)->as.heap.aux.shared;
1387  rb_ary_concat(assoc, add);
1388  add = assoc;
1389  }
1390  }
1391  else if (STR_EMBED_P(str)) {
1392  str_make_independent(str);
1393  }
1394  else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1395  RESIZE_CAPA(str, RSTRING_LEN(str));
1396  }
1397  FL_SET(str, STR_ASSOC);
1398  RBASIC(add)->klass = 0;
1399  RSTRING(str)->as.heap.aux.shared = add;
1400  }
1401 }
1402 
/*
 * Returns the value stored in heap.aux.shared when str has STR_ASSOC set,
 * otherwise Qfalse.  A shared string is first replaced by the string it
 * shares with, and that string is the one tested.
 * NOTE(review): listing line 1404 (function name and parameter list) is
 * missing from this extract.
 */
1403 VALUE
1405 {
1406  if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1407  if (STR_ASSOC_P(str)) {
1408  return RSTRING(str)->as.heap.aux.shared;
1409  }
1410  return Qfalse;
1411 }
1412 
1413 VALUE
1414 rb_string_value(volatile VALUE *ptr)
1415 {
1416  VALUE s = *ptr;
1417  if (TYPE(s) != T_STRING) {
1418  s = rb_str_to_str(s);
1419  *ptr = s;
1420  }
1421  return s;
1422 }
1423 
/*
 * Coerces *ptr with rb_string_value() and returns RSTRING_PTR() of the
 * resulting string.
 * NOTE(review): listing line 1425 (function name and parameter list) is
 * missing from this extract.
 */
1424 char *
1426 {
1427  VALUE str = rb_string_value(ptr);
1428  return RSTRING_PTR(str);
1429 }
1430 
/*
 * Coerces *ptr with rb_string_value() and returns its byte pointer for use
 * as a NUL-terminated C string.
 * Raises ArgumentError ("string contains null byte") when the pointer is
 * null or a zero byte occurs within the first len bytes.
 * NOTE(review): listing line 1432 (function name and parameter list) is
 * missing from this extract.
 */
1431 char *
1433 {
1434  VALUE str = rb_string_value(ptr);
1435  char *s = RSTRING_PTR(str);
1436  long len = RSTRING_LEN(str);
1437
1438  if (!s || memchr(s, 0, len)) {
1439  rb_raise(rb_eArgError, "string contains null byte");
1440  }
/* the byte just past the content is not a terminator: make the string
 * modifiable, re-read the pointer and write the terminator */
1441  if (s[len]) {
1442  rb_str_modify(str);
1443  s = RSTRING_PTR(str);
1444  s[RSTRING_LEN(str)] = 0;
1445  }
1446  return s;
1447 }
1448 
/*
 * Returns the result of rb_check_convert_type(str, T_STRING, "String",
 * "to_str").
 * NOTE(review): listing line 1450 (function name and parameter list) is
 * missing from this extract.
 */
1449 VALUE
1451 {
1452  str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1453  return str;
1454 }
1455 
1456 /*
1457  * call-seq:
1458  * String.try_convert(obj) -> string or nil
1459  *
1460  * Try to convert <i>obj</i> into a String, using to_str method.
1461  * Returns converted string or nil if <i>obj</i> cannot be converted
1462  * for any reason.
1463  *
1464  * String.try_convert("str") #=> "str"
1465  * String.try_convert(/re/) #=> nil
1466  */
/*
 * Implementation note: delegates directly to rb_check_string_type(str).
 * NOTE(review): listing line 1468 (function name and parameter list) is
 * missing from this extract.
 */
1467 static VALUE
1469 {
1470  return rb_check_string_type(str);
1471 }
1472 
/*
 * Advances from p toward e by *nthp characters of encoding enc and
 * returns the resulting position.
 *
 * p, e  - start and end of the byte range.
 * nthp  - in: number of characters to skip; out: overwritten with the
 *         final value of the local counter (left unchanged by the two
 *         fixed-width branches, and the count not yet consumed in the
 *         ASCII-compatible branch).
 * enc   - encoding used to measure characters.
 *
 * The fixed-width and generic branches clamp the result to e at the
 * bottom; the ASCII-compatible branch returns e itself when it runs out
 * of bytes.
 */
1473 static char*
1474 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1475 {
1476  long nth = *nthp;
/* single-byte encoding: one character per byte */
1477  if (rb_enc_mbmaxlen(enc) == 1) {
1478  p += nth;
1479  }
/* fixed-width encoding: every character has the same byte length */
1480  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1481  p += nth * rb_enc_mbmaxlen(enc);
1482  }
/* ASCII-compatible variable width: skip ASCII runs in bulk with
 * search_nonascii(), stepping individually only over non-ASCII
 * characters */
1483  else if (rb_enc_asciicompat(enc)) {
1484  const char *p2, *e2;
1485  int n;
1486 
1487  while (p < e && 0 < nth) {
1488  e2 = p + nth;
/* fewer bytes remain than characters requested */
1489  if (e < e2) {
1490  *nthp = nth;
1491  return (char *)e;
1492  }
1493  if (ISASCII(*p)) {
1494  p2 = search_nonascii(p, e2);
/* the next nth bytes are all ASCII */
1495  if (!p2) {
1496  *nthp = nth;
1497  return (char *)e2;
1498  }
1499  nth -= p2 - p;
1500  p = p2;
1501  }
1502  n = rb_enc_mbclen(p, e, enc);
1503  p += n;
1504  nth--;
1505  }
1506  *nthp = nth;
1507  if (nth != 0) {
1508  return (char *)e;
1509  }
1510  return (char *)p;
1511  }
/* any other encoding: step one character at a time */
1512  else {
1513  while (p < e && nth--) {
1514  p += rb_enc_mbclen(p, e, enc);
1515  }
1516  }
1517  if (p > e) p = e;
1518  *nthp = nth;
1519  return (char*)p;
1520 }
1521 
1522 char*
1523 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1524 {
1525  return str_nth_len(p, e, &nth, enc);
1526 }
1527 
1528 static char*
1529 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1530 {
1531  if (singlebyte)
1532  p += nth;
1533  else {
1534  p = str_nth_len(p, e, &nth, enc);
1535  }
1536  if (!p) return 0;
1537  if (p > e) p = e;
1538  return (char *)p;
1539 }
1540 
1541 /* char offset to byte offset */
1542 static long
1543 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1544 {
1545  const char *pp = str_nth(p, e, nth, enc, singlebyte);
1546  if (!pp) return e - p;
1547  return pp - p;
1548 }
1549 
/*
 * Converts the offset pos of str into the value returned by str_offset(),
 * applied to the byte range RSTRING_PTR(str)..RSTRING_END(str).
 * NOTE(review): listing line 1554 (the remaining arguments of the
 * str_offset() call) is missing from this extract.
 */
1550 long
1551 rb_str_offset(VALUE str, long pos)
1552 {
1553  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1555 }
1556 
1557 #ifdef NONASCII_MASK
/*
 * Finds the position of the nth character in [p, e) by counting bytes
 * accepted by is_utf8_lead_byte().
 *
 * nthp - in: number of characters to skip; out: the count still
 *        outstanding when the scan stopped.
 *
 * Returns the position of the lead byte reached, or e if the range is
 * exhausted first.
 */
1558 static char *
1559 str_utf8_nth(const char *p, const char *e, long *nthp)
1560 {
1561  long nth = *nthp;
/* fast path, taken only when both the range and the count exceed two
 * VALUE words: count lead bytes one VALUE-sized word at a time */
1562  if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1563  const VALUE *s, *t;
1564  const VALUE lowbits = sizeof(VALUE) - 1;
/* s: p rounded up to a VALUE boundary; t: e rounded down */
1565  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1566  t = (const VALUE*)(~lowbits & (VALUE)e);
/* handle the unaligned head byte by byte */
1567  while (p < (const char *)s) {
1568  if (is_utf8_lead_byte(*p)) nth--;
1569  p++;
1570  }
/* stop the word loop while at least a word's worth of characters is
 * still outstanding, so nth cannot go negative here */
1571  do {
1572  nth -= count_utf8_lead_bytes_with_word(s);
1573  s++;
1574  } while (s < t && (int)sizeof(VALUE) <= nth);
1575  p = (char *)s;
1576  }
/* byte-wise tail: stop on the lead byte that begins character nth */
1577  while (p < e) {
1578  if (is_utf8_lead_byte(*p)) {
1579  if (nth == 0) break;
1580  nth--;
1581  }
1582  p++;
1583  }
1584  *nthp = nth;
1585  return (char *)p;
1586 }
1587 
/*
 * Returns the byte distance from p to the position str_utf8_nth() reports
 * for the nth character of [p, e).
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *hit = str_utf8_nth(p, e, &remaining);

    return hit - p;
}
1594 #endif
1595 
1596 /* byte offset to char offset */
1597 long
1598 rb_str_sublen(VALUE str, long pos)
1599 {
1600  if (single_byte_optimizable(str) || pos < 0)
1601  return pos;
1602  else {
1603  char *p = RSTRING_PTR(str);
1604  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1605  }
1606 }
1607 
1608 VALUE
1609 rb_str_subseq(VALUE str, long beg, long len)
1610 {
1611  VALUE str2;
1612 
1613  if (RSTRING_LEN(str) == beg + len &&
1614  RSTRING_EMBED_LEN_MAX < len) {
1615  str2 = rb_str_new_shared(rb_str_new_frozen(str));
1616  rb_str_drop_bytes(str2, beg);
1617  }
1618  else {
1619  str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1620  }
1621 
1622  rb_enc_cr_str_copy_for_substr(str2, str);
1623  OBJ_INFECT(str2, str);
1624 
1625  return str2;
1626 }
1627 
/*
 * Returns the substring of str that starts at character index beg and
 * spans at most len characters, or Qnil when len is negative or beg lies
 * outside the string.  A negative beg counts from the end.
 *
 * Every branch below computes p (start of the result) and converts len
 * into a byte length before reaching the `sub:` label, which builds the
 * result.
 */
1628 VALUE
1629 rb_str_substr(VALUE str, long beg, long len)
1630 {
1631  rb_encoding *enc = STR_ENC_GET(str);
1632  VALUE str2;
1633  char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1634 
1635  if (len < 0) return Qnil;
1636  if (!RSTRING_LEN(str)) {
1637  len = 0;
1638  }
/* single-byte strings: character indices are byte indices */
1639  if (single_byte_optimizable(str)) {
1640  if (beg > RSTRING_LEN(str)) return Qnil;
1641  if (beg < 0) {
1642  beg += RSTRING_LEN(str);
1643  if (beg < 0) return Qnil;
1644  }
1645  if (beg + len > RSTRING_LEN(str))
1646  len = RSTRING_LEN(str) - beg;
1647  if (len <= 0) {
1648  len = 0;
1649  p = 0;
1650  }
1651  else
1652  p = s + beg;
1653  goto sub;
1654  }
1655  if (beg < 0) {
1656  if (len > -beg) len = -beg;
/* start is close to the end relative to the string size: walk
 * backwards from the end instead of counting the whole string */
1657  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1658  beg = -beg;
/* move e back to the end of the requested range ... */
1659  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1660  p = e;
1661  if (!p) return Qnil;
/* ... then move p back len characters to its start */
1662  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1663  if (!p) return Qnil;
1664  len = e - p;
1665  goto sub;
1666  }
1667  else {
1668  beg += str_strlen(str, enc);
1669  if (beg < 0) return Qnil;
1670  }
1671  }
/* a character index larger than the byte length cannot exist */
1672  else if (beg > 0 && beg > RSTRING_LEN(str)) {
1673  return Qnil;
1674  }
1675  if (len == 0) {
1676  if (beg > str_strlen(str, enc)) return Qnil;
1677  p = 0;
1678  }
1679 #ifdef NONASCII_MASK
/* valid UTF-8: use the lead-byte counting helpers */
1680  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1681  enc == rb_utf8_encoding()) {
1682  p = str_utf8_nth(s, e, &beg);
1683  if (beg > 0) return Qnil;
1684  len = str_utf8_offset(p, e, len);
1685  }
1686 #endif
/* fixed-width encoding: positions follow from multiplication */
1687  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1688  int char_sz = rb_enc_mbmaxlen(enc);
1689 
1690  p = s + beg * char_sz;
1691  if (p > e) {
1692  return Qnil;
1693  }
1694  else if (len * char_sz > e - p)
1695  len = e - p;
1696  else
1697  len *= char_sz;
1698  }
/* general case; str_nth_len() leaves the unconsumed count in beg */
1699  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1700  if (beg > 0) return Qnil;
1701  len = 0;
1702  }
1703  else {
1704  len = str_offset(p, e, len, enc, 0);
1705  }
1706  sub:
/* long result for which beg + len equals the byte length: share the
 * source buffer and point at its last len bytes instead of copying */
1707  if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1708  str2 = rb_str_new4(str);
1709  str2 = str_new3(rb_obj_class(str2), str2);
1710  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1711  RSTRING(str2)->as.heap.len = len;
1712  }
1713  else {
1714  str2 = rb_str_new5(str, p, len);
1715  rb_enc_cr_str_copy_for_substr(str2, str);
1716  OBJ_INFECT(str2, str);
1717  }
1718 
1719  return str2;
1720 }
1721 
/*
 * Freezes str with rb_obj_freeze() and returns the result.  When str has
 * STR_ASSOC set, the array stored in heap.aux.shared is frozen first.
 * NOTE(review): listing line 1723 (function name and parameter list) is
 * missing from this extract.
 */
1722 VALUE
1724 {
1725  if (STR_ASSOC_P(str)) {
1726  VALUE ary = RSTRING(str)->as.heap.aux.shared;
1727  OBJ_FREEZE(ary);
1728  }
1729  return rb_obj_freeze(str);
1730 }
1731 
1733 #define rb_str_dup_frozen rb_str_new_frozen
1734 
1735 VALUE
1736 rb_str_locktmp(VALUE str)
1737 {
1738  if (FL_TEST(str, STR_TMPLOCK)) {
1739  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1740  }
1741  FL_SET(str, STR_TMPLOCK);
1742  return str;
1743 }
1744 
/*
 * Clears the STR_TMPLOCK flag on str and returns str.
 * Raises RuntimeError when the flag is not currently set.
 * NOTE(review): listing line 1746 (function name and parameter list) is
 * missing from this extract.
 */
1745 VALUE
1747 {
1748  if (!FL_TEST(str, STR_TMPLOCK)) {
1749  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1750  }
1751  FL_UNSET(str, STR_TMPLOCK);
1752  return str;
1753 }
1754 
/*
 * Locks str with rb_str_locktmp(), then runs func(arg) through
 * rb_ensure() with rb_str_unlocktmp(str) registered as the ensure
 * handler.  Returns the value of the rb_ensure() call.
 * NOTE(review): listing line 1756 (function name and parameter list) is
 * missing from this extract.
 */
1755 VALUE
1757 {
1758  rb_str_locktmp(str);
1759  return rb_ensure(func, arg, rb_str_unlocktmp, str);
1760 }
1761 
1762 void
1763 rb_str_set_len(VALUE str, long len)
1764 {
1765  long capa;
1766 
1767  str_modifiable(str);
1768  if (STR_SHARED_P(str)) {
1769  rb_raise(rb_eRuntimeError, "can't set length of shared string");
1770  }
1771  if (len > (capa = (long)rb_str_capacity(str))) {
1772  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1773  }
1774  STR_SET_LEN(str, len);
1775  RSTRING_PTR(str)[len] = '\0';
1776 }
1777 
/*
 * Changes the byte length of str to len and returns str.
 *
 * Raises ArgumentError when len is negative.  The cached coderange is
 * always cleared.  When the length actually changes, a '\0' is written at
 * the new end and the representation is adjusted: an embedded string that
 * still fits is updated in place, a heap string that now fits becomes
 * embedded, and otherwise the heap buffer is detached or reallocated.
 */
1778 VALUE
1779 rb_str_resize(VALUE str, long len)
1780 {
1781  long slen;
1782  int independent;
1783 
1784  if (len < 0) {
1785  rb_raise(rb_eArgError, "negative string size (or size too big)");
1786  }
1787 
1788  independent = str_independent(str);
1789  ENC_CODERANGE_CLEAR(str);
1790  slen = RSTRING_LEN(str);
1791  if (len != slen) {
1792  if (STR_EMBED_P(str)) {
/* embedded and still fits: adjust the length in place */
1793  if (len <= RSTRING_EMBED_LEN_MAX) {
1794  STR_SET_EMBED_LEN(str, len);
1795  RSTRING(str)->as.ary[len] = '\0';
1796  return str;
1797  }
/* embedded but too small now: move to a heap buffer */
1798  str_make_independent_expand(str, len - slen);
1799  STR_SET_NOEMBED(str);
1800  }
/* heap string that now fits the embedded area: copy the surviving
 * bytes in and free the old buffer only if this string owned it */
1801  else if (len <= RSTRING_EMBED_LEN_MAX) {
1802  char *ptr = RSTRING(str)->as.heap.ptr;
1803  STR_SET_EMBED(str);
1804  if (slen > len) slen = len;
1805  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1806  RSTRING(str)->as.ary[len] = '\0';
1807  STR_SET_EMBED_LEN(str, len);
1808  if (independent) xfree(ptr);
1809  return str;
1810  }
1811  else if (!independent) {
1812  str_make_independent_expand(str, len - slen);
1813  }
/* reallocate only when growing or shrinking by more than 1024 bytes */
1814  else if (slen < len || slen - len > 1024) {
1815  REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1816  }
1817  if (!STR_NOCAPA_P(str)) {
1818  RSTRING(str)->as.heap.aux.capa = len;
1819  }
1820  RSTRING(str)->as.heap.len = len;
1821  RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
1822  }
1823  return str;
1824 }
1825 
1826 static VALUE
1827 str_buf_cat(VALUE str, const char *ptr, long len)
1828 {
1829  long capa, total, off = -1;
1830 
1831  if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1832  off = ptr - RSTRING_PTR(str);
1833  }
1834  rb_str_modify(str);
1835  if (len == 0) return 0;
1836  if (STR_ASSOC_P(str)) {
1837  FL_UNSET(str, STR_ASSOC);
1838  capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1839  }
1840  else if (STR_EMBED_P(str)) {
1841  capa = RSTRING_EMBED_LEN_MAX;
1842  }
1843  else {
1844  capa = RSTRING(str)->as.heap.aux.capa;
1845  }
1846  if (RSTRING_LEN(str) >= LONG_MAX - len) {
1847  rb_raise(rb_eArgError, "string sizes too big");
1848  }
1849  total = RSTRING_LEN(str)+len;
1850  if (capa <= total) {
1851  while (total > capa) {
1852  if (capa + 1 >= LONG_MAX / 2) {
1853  capa = (total + 4095) / 4096;
1854  break;
1855  }
1856  capa = (capa + 1) * 2;
1857  }
1858  RESIZE_CAPA(str, capa);
1859  }
1860  if (off != -1) {
1861  ptr = RSTRING_PTR(str) + off;
1862  }
1863  memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1864  STR_SET_LEN(str, total);
1865  RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1866 
1867  return str;
1868 }
1869 
1870 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1871 
1872 VALUE
1873 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1874 {
1875  if (len == 0) return str;
1876  if (len < 0) {
1877  rb_raise(rb_eArgError, "negative string size (or size too big)");
1878  }
1879  return str_buf_cat(str, ptr, len);
1880 }
1881 
1882 VALUE
1883 rb_str_buf_cat2(VALUE str, const char *ptr)
1884 {
1885  return rb_str_buf_cat(str, ptr, strlen(ptr));
1886 }
1887 
1888 VALUE
1889 rb_str_cat(VALUE str, const char *ptr, long len)
1890 {
1891  if (len < 0) {
1892  rb_raise(rb_eArgError, "negative string size (or size too big)");
1893  }
1894  if (STR_ASSOC_P(str)) {
1895  char *p;
1896  rb_str_modify_expand(str, len);
1897  p = RSTRING(str)->as.heap.ptr;
1898  memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1899  len = RSTRING(str)->as.heap.len += len;
1900  p[len] = '\0'; /* sentinel */
1901  return str;
1902  }
1903 
1904  return rb_str_buf_cat(str, ptr, len);
1905 }
1906 
1907 VALUE
1908 rb_str_cat2(VALUE str, const char *ptr)
1909 {
1910  return rb_str_cat(str, ptr, strlen(ptr));
1911 }
1912 
/*
 * Appends len bytes at ptr to str while working out the encoding index
 * and coderange of the result.
 *
 * ptr_encindex - encoding index of the appended bytes.
 * ptr_cr       - their coderange; scanned here when given as UNKNOWN and
 *                the value is needed.
 * ptr_cr_ret   - when non-null, receives the coderange determined for
 *                the appended bytes.
 *
 * Returns str.  Raises EncCompatError when the encodings differ and
 * cannot be combined, and ArgumentError when len is negative.
 */
1913 static VALUE
1914 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1915  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1916 {
1917  int str_encindex = ENCODING_GET(str);
1918  int res_encindex;
1919  int str_cr, res_cr;
1920 
1921  str_cr = ENC_CODERANGE(str);
1922 
1923  if (str_encindex == ptr_encindex) {
/* same encoding: scanning ptr is pointless while str's own coderange
 * is unknown, because the result will be unknown too */
1924  if (str_cr == ENC_CODERANGE_UNKNOWN)
1925  ptr_cr = ENC_CODERANGE_UNKNOWN;
1926  else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1927  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1928  }
1929  }
1930  else {
1931  rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1932  rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
/* a non-ASCII-compatible encoding on either side can only be combined
 * when one of the operands is empty */
1933  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
1934  if (len == 0)
1935  return str;
1936  if (RSTRING_LEN(str) == 0) {
1937  rb_str_buf_cat(str, ptr, len);
1938  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1939  return str;
1940  }
1941  goto incompatible;
1942  }
1943  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1944  ptr_cr = coderange_scan(ptr, len, ptr_enc);
1945  }
1946  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1947  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
1948  str_cr = rb_enc_str_coderange(str);
1949  }
1950  }
1951  }
1952  if (ptr_cr_ret)
1953  *ptr_cr_ret = ptr_cr;
1954 
/* different encodings are acceptable only when at least one side is
 * pure 7-bit */
1955  if (str_encindex != ptr_encindex &&
1956  str_cr != ENC_CODERANGE_7BIT &&
1957  ptr_cr != ENC_CODERANGE_7BIT) {
1958  incompatible:
1959  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1960  rb_enc_name(rb_enc_from_index(str_encindex)),
1961  rb_enc_name(rb_enc_from_index(ptr_encindex)));
1962  }
1963 
/* choose the encoding index and coderange of the result */
1964  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1965  res_encindex = str_encindex;
1966  res_cr = ENC_CODERANGE_UNKNOWN;
1967  }
1968  else if (str_cr == ENC_CODERANGE_7BIT) {
1969  if (ptr_cr == ENC_CODERANGE_7BIT) {
1970  res_encindex = str_encindex;
1971  res_cr = ENC_CODERANGE_7BIT;
1972  }
1973  else {
/* 7-bit receiver takes over the encoding of the appended bytes */
1974  res_encindex = ptr_encindex;
1975  res_cr = ptr_cr;
1976  }
1977  }
1978  else if (str_cr == ENC_CODERANGE_VALID) {
1979  res_encindex = str_encindex;
1980  if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
1981  res_cr = str_cr;
1982  else
1983  res_cr = ptr_cr;
1984  }
1985  else { /* str_cr == ENC_CODERANGE_BROKEN */
1986  res_encindex = str_encindex;
1987  res_cr = str_cr;
1988  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1989  }
1990 
/* NOTE(review): len is validated only here, after it may already have
 * been handed to coderange_scan() above */
1991  if (len < 0) {
1992  rb_raise(rb_eArgError, "negative string size (or size too big)");
1993  }
1994  str_buf_cat(str, ptr, len);
1995  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1996  return str;
1997 }
1998 
/*
 * Appends len bytes at ptr, encoded in ptr_enc, to str by delegating to
 * rb_enc_cr_str_buf_cat().
 * NOTE(review): listing line 2003 (the remaining arguments of the call)
 * is missing from this extract.
 */
1999 VALUE
2000 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2001 {
2002  return rb_enc_cr_str_buf_cat(str, ptr, len,
2004 }
2005 
2006 VALUE
2007 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2008 {
2009  /* ptr must reference NUL terminated ASCII string. */
2010  int encindex = ENCODING_GET(str);
2011  rb_encoding *enc = rb_enc_from_index(encindex);
2012  if (rb_enc_asciicompat(enc)) {
2013  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2014  encindex, ENC_CODERANGE_7BIT, 0);
2015  }
2016  else {
2017  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2018  while (*ptr) {
2019  unsigned int c = (unsigned char)*ptr;
2020  int len = rb_enc_codelen(c, enc);
2021  rb_enc_mbcput(c, buf, enc);
2022  rb_enc_cr_str_buf_cat(str, buf, len,
2023  encindex, ENC_CODERANGE_VALID, 0);
2024  ptr++;
2025  }
2026  return str;
2027  }
2028 }
2029 
/*
 * Appends the bytes of str2 to str through rb_enc_cr_str_buf_cat(),
 * propagates taint from str2 to str, and returns str.  The coderange
 * reported back for str2 is stored on str2.
 * NOTE(review): listing line 2031 (function name and parameter list) is
 * missing from this extract.
 */
2030 VALUE
2032 {
2033  int str2_cr;
2034 
2035  str2_cr = ENC_CODERANGE(str2);
2036 
2037  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2038  ENCODING_GET(str2), str2_cr, &str2_cr);
2039 
2040  OBJ_INFECT(str, str2);
2041  ENC_CODERANGE_SET(str2, str2_cr);
2042 
2043  return str;
2044 }
2045 
/*
 * Appends str2 (coerced with StringValue) to str and returns str.
 *
 * A receiver with STR_ASSOC set and a non-empty str2 is handled inline so
 * the association is kept; every other case is passed on to
 * rb_str_buf_append().
 * NOTE(review): listing line 2047 (function name and parameter list) is
 * missing from this extract.
 */
2046 VALUE
2048 {
2049  rb_encoding *enc;
2050  int cr, cr2;
2051  long len2;
2052 
2053  StringValue(str2);
2054  if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2055  long len = RSTRING_LEN(str) + len2;
2056  enc = rb_enc_check(str, str2);
/* the result takes the numerically larger of the two coderanges */
2057  cr = ENC_CODERANGE(str);
2058  if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2059  rb_str_modify_expand(str, len2);
/* len2+1 bytes: also copies the byte following str2's content */
2060  memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2061  RSTRING_PTR(str2), len2+1);
2062  RSTRING(str)->as.heap.len = len;
2063  rb_enc_associate(str, enc);
2064  ENC_CODERANGE_SET(str, cr);
2065  OBJ_INFECT(str, str2);
2066  return str;
2067  }
2068  return rb_str_buf_append(str, str2);
2069 }
2070 
2071 /*
2072  * call-seq:
2073  * str << integer -> str
2074  * str.concat(integer) -> str
2075  * str << obj -> str
2076  * str.concat(obj) -> str
2077  *
2078  * Append---Concatenates the given object to <i>str</i>. If the object is a
2079  * <code>Integer</code>, it is considered as a codepoint, and is converted
2080  * to a character before concatenation.
2081  *
2082  * a = "hello "
2083  * a << "world" #=> "hello world"
2084  * a.concat(33) #=> "hello world!"
2085  */
2086 
/*
 * Implementation notes: a Fixnum or Bignum argument is converted to an
 * unsigned code with rb_num_to_uint() and appended as one character in
 * the receiver's encoding; any other argument goes to rb_str_append().
 * NOTE(review): listing lines 2088 (function name and parameter list),
 * 2116-2117, 2127 and 2130 are missing from this extract, so the body of
 * the `code > 127` branch and the labels of the switch are not visible.
 */
2087 VALUE
2089 {
2090  unsigned int code;
2091  rb_encoding *enc = STR_ENC_GET(str1);
2092 
2093  if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
2094  if (rb_num_to_uint(str2, &code) == 0) {
2095  }
2096  else if (FIXNUM_P(str2)) {
2097  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2098  }
2099  else {
2100  rb_raise(rb_eRangeError, "bignum out of char range");
2101  }
2102  }
2103  else {
2104  return rb_str_append(str1, str2);
2105  }
2106 
2107  if (enc == rb_usascii_encoding()) {
2108  /* US-ASCII automatically extended to ASCII-8BIT */
2109  char buf[1];
2110  buf[0] = (char)code;
2111  if (code > 0xFF) {
2112  rb_raise(rb_eRangeError, "%u out of char range", code);
2113  }
2114  rb_str_cat(str1, buf, 1);
2115  if (code > 127) {
2118  }
2119  }
2120  else {
2121  long pos = RSTRING_LEN(str1);
2122  int cr = ENC_CODERANGE(str1);
2123  int len;
2124  char *buf;
2125 
2126  switch (len = rb_enc_codelen(code, enc)) {
2128  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2129  break;
2131  case 0:
2132  rb_raise(rb_eRangeError, "%u out of char range", code);
2133  break;
2134  }
2135  buf = ALLOCA_N(char, len + 1);
2136  rb_enc_mbcput(code, buf, enc);
/* re-measure the encoded bytes to reject codes the encoding accepts a
 * length for but cannot represent as a valid character */
2137  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2138  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2139  }
2140  rb_str_resize(str1, pos+len);
/* NOTE(review): strncpy() stops at the first zero byte in buf and
 * zero-fills the rest, so an encoded character that contains a zero
 * byte before its last byte would not be copied verbatim; memcpy()
 * looks like the intended call -- confirm */
2141  strncpy(RSTRING_PTR(str1) + pos, buf, len);
2142  if (cr == ENC_CODERANGE_7BIT && code > 127)
2143  cr = ENC_CODERANGE_VALID;
2144  ENC_CODERANGE_SET(str1, cr);
2145  }
2146  return str1;
2147 }
2148 
2149 /*
2150  * call-seq:
2151  * str.prepend(other_str) -> str
2152  *
2153  * Prepend---Prepend the given string to <i>str</i>.
2154  *
2155  * a = "world"
2156  * a.prepend("hello ") #=> "hello world"
2157  * a #=> "hello world"
2158  */
2159 
/*
 * Implementation note: coerces both operands with StringValue, then
 * inserts str2 at position 0 with length 0 via rb_str_update() and
 * returns str.
 * NOTE(review): listing line 2161 (function name and parameter list) is
 * missing from this extract.
 */
2160 static VALUE
2162 {
2163  StringValue(str2);
2164  StringValue(str);
2165  rb_str_update(str, 0L, 0L, str2);
2166  return str;
2167 }
2168 
/*
 * Returns rb_memhash() of the string's bytes XORed with its encoding
 * index.  The index is replaced by 0 when the string's coderange is
 * 7-bit, so 7-bit content hashes the same under every encoding.
 * NOTE(review): listing line 2170 (function name and parameter list) is
 * missing from this extract.
 */
2169 st_index_t
2171 {
2172  int e = ENCODING_GET(str);
2173  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2174  e = 0;
2175  }
2176  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2177 }
2178 
/*
 * Returns 0 when str1 and str2 are comparable (rb_str_comparable), have
 * the same byte length and identical bytes; returns 1 otherwise.
 * NOTE(review): listing line 2180 (function name and parameter list) is
 * missing from this extract.
 */
2179 int
2181 {
2182  long len;
2183 
2184  if (!rb_str_comparable(str1, str2)) return 1;
2185  if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2186  memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2187  return 0;
2188  }
2189  return 1;
2190 }
2191 
2192 /*
2193  * call-seq:
2194  * str.hash -> fixnum
2195  *
2196  * Return a hash based on the string's length and content.
2197  */
2198 
/*
 * Implementation note: computes rb_str_hash(str) and converts the value
 * to a Fixnum with INT2FIX().
 * NOTE(review): listing line 2200 (function name and parameter list) is
 * missing from this extract.
 */
2199 static VALUE
2201 {
2202  st_index_t hval = rb_str_hash(str);
2203  return INT2FIX(hval);
2204 }
2205 
2206 #define lesser(a,b) (((a)>(b))?(b):(a))
2207 
/*
 * Decides whether two strings can be compared byte-wise; returns TRUE or
 * FALSE.  Visible cases that yield TRUE: either string is empty, both
 * have the same encoding index, or both have a 7-bit coderange.
 * NOTE(review): listing lines 2209 (function name and parameter list),
 * 2223 and 2227 are missing from this extract; the two missing interior
 * lines presumably guard the `return TRUE` statements that follow them --
 * verify against the complete source.
 */
2208 int
2210 {
2211  int idx1, idx2;
2212  int rc1, rc2;
2213 
2214  if (RSTRING_LEN(str1) == 0) return TRUE;
2215  if (RSTRING_LEN(str2) == 0) return TRUE;
2216  idx1 = ENCODING_GET(str1);
2217  idx2 = ENCODING_GET(str2);
2218  if (idx1 == idx2) return TRUE;
2219  rc1 = rb_enc_str_coderange(str1);
2220  rc2 = rb_enc_str_coderange(str2);
2221  if (rc1 == ENC_CODERANGE_7BIT) {
2222  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2224  return TRUE;
2225  }
2226  if (rc2 == ENC_CODERANGE_7BIT) {
2228  return TRUE;
2229  }
2230  return FALSE;
2231 }
2232 
/*
 * Three-way byte comparison of two strings; returns -1, 0 or 1.
 *
 * The common prefix is compared with memcmp(); on a tie the longer string
 * is greater.  Strings with identical bytes that are not comparable
 * (rb_str_comparable) are ordered by encoding index instead of being
 * reported equal.
 * NOTE(review): listing line 2234 (function name and parameter list) is
 * missing from this extract.
 */
2233 int
2235 {
2236  long len1, len2;
2237  const char *ptr1, *ptr2;
2238  int retval;
2239 
2240  if (str1 == str2) return 0;
2241  RSTRING_GETMEM(str1, ptr1, len1);
2242  RSTRING_GETMEM(str2, ptr2, len2);
/* identical buffers skip memcmp(); retval is not used on that path */
2243  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2244  if (len1 == len2) {
2245  if (!rb_str_comparable(str1, str2)) {
2246  if (ENCODING_GET(str1) > ENCODING_GET(str2))
2247  return 1;
2248  return -1;
2249  }
2250  return 0;
2251  }
2252  if (len1 > len2) return 1;
2253  return -1;
2254  }
2255  if (retval > 0) return 1;
2256  return -1;
2257 }
2258 
2259 /* expect tail call optimization */
2260 static VALUE
2261 str_eql(const VALUE str1, const VALUE str2)
2262 {
2263  const long len = RSTRING_LEN(str1);
2264  const char *ptr1, *ptr2;
2265 
2266  if (len != RSTRING_LEN(str2)) return Qfalse;
2267  if (!rb_str_comparable(str1, str2)) return Qfalse;
2268  if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2269  return Qtrue;
2270  if (memcmp(ptr1, ptr2, len) == 0)
2271  return Qtrue;
2272  return Qfalse;
2273 }
2274 /*
2275  * call-seq:
2276  * str == obj -> true or false
2277  *
2278  * Equality---If <i>obj</i> is not a <code>String</code>, returns
2279  * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2280  * <code><=></code> <i>obj</i> returns zero.
2281  */
2282 
/*
 * Implementation notes: identical objects are equal.  A non-String
 * argument that responds to to_str is asked to compare itself against the
 * receiver via rb_equal(str2, str1); one that does not yields Qfalse.
 * Two Strings are compared with str_eql().
 * NOTE(review): listing line 2284 (function name and parameter list) is
 * missing from this extract.
 */
2283 VALUE
2285 {
2286  if (str1 == str2) return Qtrue;
2287  if (TYPE(str2) != T_STRING) {
2288  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2289  return Qfalse;
2290  }
2291  return rb_equal(str2, str1);
2292  }
2293  return str_eql(str1, str2);
2294 }
2295 
2296 /*
2297  * call-seq:
2298  * str.eql?(other) -> true or false
2299  *
2300  * Two strings are equal if they have the same length and content.
2301  */
2302 
/*
 * Implementation notes: identical objects are equal, a non-String
 * argument is never equal (no to_str conversion is attempted), and two
 * Strings are compared with str_eql().
 * NOTE(review): listing line 2304 (function name and parameter list) is
 * missing from this extract.
 */
2303 static VALUE
2305 {
2306  if (str1 == str2) return Qtrue;
2307  if (TYPE(str2) != T_STRING) return Qfalse;
2308  return str_eql(str1, str2);
2309 }
2310 
2311 /*
2312  * call-seq:
2313  * str <=> other_str -> -1, 0, +1 or nil
2314  *
2315  * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
2316  * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
2317  * <i>str</i>. If the strings are of different lengths, and the strings are
2318  * equal when compared up to the shortest length, then the longer string is
2319  * considered greater than the shorter one. In older versions of Ruby, setting
2320  * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
2321  * in favor of using <code>String#casecmp</code>.
2322  *
2323  * <code><=></code> is the basis for the methods <code><</code>,
2324  * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
2325  * included from module <code>Comparable</code>. The method
2326  * <code>String#==</code> does not use <code>Comparable#==</code>.
2327  *
2328  * "abcdef" <=> "abcde" #=> 1
2329  * "abcdef" <=> "abcdef" #=> 0
2330  * "abcdef" <=> "abcdefg" #=> -1
2331  * "abcdef" <=> "ABCDEF" #=> 1
2332  */
2333 
/*
 * Implementation notes: two Strings are compared with rb_str_cmp().  A
 * non-String argument must respond to both to_str and <=>, otherwise the
 * result is Qnil; its own <=> is then called with the receiver and the
 * sign of that answer is inverted.
 * NOTE(review): listing line 2335 (function name and parameter list) is
 * missing from this extract.
 */
2334 static VALUE
2336 {
2337  long result;
2338 
2339  if (TYPE(str2) != T_STRING) {
2340  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2341  return Qnil;
2342  }
2343  else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2344  return Qnil;
2345  }
2346  else {
2347  VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2348 
2349  if (NIL_P(tmp)) return Qnil;
/* non-Fixnum answer: negate it by computing 0 - tmp at Ruby level */
2350  if (!FIXNUM_P(tmp)) {
2351  return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2352  }
2353  result = -FIX2LONG(tmp);
2354  }
2355  }
2356  else {
2357  result = rb_str_cmp(str1, str2);
2358  }
2359  return LONG2NUM(result);
2360 }
2361 
2362 /*
2363  * call-seq:
2364  * str.casecmp(other_str) -> -1, 0, +1 or nil
2365  *
2366  * Case-insensitive version of <code>String#<=></code>.
2367  *
2368  * "abcdef".casecmp("abcde") #=> 1
2369  * "aBcDeF".casecmp("abcdef") #=> 0
2370  * "abcdef".casecmp("abcdefg") #=> -1
2371  * "abcdef".casecmp("ABCDEF") #=> 0
2372  */
2373 
/*
 * Implementation notes: returns Qnil when the two strings have no
 * compatible encoding (rb_enc_compatible).  Characters are compared after
 * TOUPPER(); in the multibyte path only characters that rb_enc_ascget()
 * recognises as ASCII are case-folded, all others are compared byte-wise.
 * When one string is a prefix of the other, the longer one is greater.
 * NOTE(review): listing line 2375 (function name and parameter list) is
 * missing from this extract.
 */
2374 static VALUE
2376 {
2377  long len;
2378  rb_encoding *enc;
2379  char *p1, *p1end, *p2, *p2end;
2380 
2381  StringValue(str2);
2382  enc = rb_enc_compatible(str1, str2);
2383  if (!enc) {
2384  return Qnil;
2385  }
2386 
2387  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2388  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
/* both single-byte: compare byte by byte */
2389  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2390  while (p1 < p1end && p2 < p2end) {
2391  if (*p1 != *p2) {
2392  unsigned int c1 = TOUPPER(*p1 & 0xff);
2393  unsigned int c2 = TOUPPER(*p2 & 0xff);
2394  if (c1 != c2)
2395  return INT2FIX(c1 < c2 ? -1 : 1);
2396  }
2397  p1++;
2398  p2++;
2399  }
2400  }
2401  else {
2402  while (p1 < p1end && p2 < p2end) {
2403  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2404  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2405 
/* both characters are ASCII: compare case-insensitively */
2406  if (0 <= c1 && 0 <= c2) {
2407  c1 = TOUPPER(c1);
2408  c2 = TOUPPER(c2);
2409  if (c1 != c2)
2410  return INT2FIX(c1 < c2 ? -1 : 1);
2411  }
/* at least one non-ASCII character: compare the raw bytes, then the
 * character lengths */
2412  else {
2413  int r;
2414  l1 = rb_enc_mbclen(p1, p1end, enc);
2415  l2 = rb_enc_mbclen(p2, p2end, enc);
2416  len = l1 < l2 ? l1 : l2;
2417  r = memcmp(p1, p2, len);
2418  if (r != 0)
2419  return INT2FIX(r < 0 ? -1 : 1);
2420  if (l1 != l2)
2421  return INT2FIX(l1 < l2 ? -1 : 1);
2422  }
2423  p1 += l1;
2424  p2 += l2;
2425  }
2426  }
2427  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2428  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2429  return INT2FIX(-1);
2430 }
2431 
2432 static long
2433 rb_str_index(VALUE str, VALUE sub, long offset)
2434 {
2435  long pos;
2436  char *s, *sptr, *e;
2437  long len, slen;
2438  rb_encoding *enc;
2439 
2440  enc = rb_enc_check(str, sub);
2441  if (is_broken_string(sub)) {
2442  return -1;
2443  }
2444  len = str_strlen(str, enc);
2445  slen = str_strlen(sub, enc);
2446  if (offset < 0) {
2447  offset += len;
2448  if (offset < 0) return -1;
2449  }
2450  if (len - offset < slen) return -1;
2451  s = RSTRING_PTR(str);
2452  e = s + RSTRING_LEN(str);
2453  if (offset) {
2454  offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2455  s += offset;
2456  }
2457  if (slen == 0) return offset;
2458  /* need proceed one character at a time */
2459  sptr = RSTRING_PTR(sub);
2460  slen = RSTRING_LEN(sub);
2461  len = RSTRING_LEN(str) - offset;
2462  for (;;) {
2463  char *t;
2464  pos = rb_memsearch(sptr, slen, s, len, enc);
2465  if (pos < 0) return pos;
2466  t = rb_enc_right_char_head(s, s+pos, e, enc);
2467  if (t == s + pos) break;
2468  if ((len -= t - s) <= 0) return -1;
2469  offset += t - s;
2470  s = t;
2471  }
2472  return pos + offset;
2473 }
2474 
2475 
2476 /*
2477  * call-seq:
2478  * str.index(substring [, offset]) -> fixnum or nil
2479  * str.index(regexp [, offset]) -> fixnum or nil
2480  *
2481  * Returns the index of the first occurrence of the given <i>substring</i> or
2482  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2483  * found. If the second parameter is present, it specifies the position in the
2484  * string to begin the search.
2485  *
2486  * "hello".index('e') #=> 1
2487  * "hello".index('lo') #=> 3
2488  * "hello".index('a') #=> nil
2489  * "hello".index(?e) #=> 1
2490  * "hello".index(/[aeiou]/, -3) #=> 4
2491  */
2492 
/*
 * Implementation notes: the optional second argument is the starting
 * character position (default 0; negative counts from the end).  A Regexp
 * pattern is searched with rb_reg_search(); anything else must convert to
 * a String (TypeError otherwise) and is searched with rb_str_index().  The
 * byte position found is converted back to a character position with
 * rb_str_sublen().
 * NOTE(review): listing lines 2494 (function name and parameter list) and
 * 2510 (body of the T_REGEXP check for an out-of-range start) are missing
 * from this extract.
 */
2493 static VALUE
2495 {
2496  VALUE sub;
2497  VALUE initpos;
2498  long pos;
2499 
2500  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2501  pos = NUM2LONG(initpos);
2502  }
2503  else {
2504  pos = 0;
2505  }
2506  if (pos < 0) {
2507  pos += str_strlen(str, STR_ENC_GET(str));
2508  if (pos < 0) {
2509  if (TYPE(sub) == T_REGEXP) {
2511  }
2512  return Qnil;
2513  }
2514  }
2515 
2516  switch (TYPE(sub)) {
2517  case T_REGEXP:
2518  if (pos > str_strlen(str, STR_ENC_GET(str)))
2519  return Qnil;
/* character position -> byte position for the regexp engine */
2520  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2521  rb_enc_check(str, sub), single_byte_optimizable(str));
2522 
2523  pos = rb_reg_search(sub, str, pos, 0);
2524  pos = rb_str_sublen(str, pos);
2525  break;
2526 
2527  default: {
2528  VALUE tmp;
2529 
2530  tmp = rb_check_string_type(sub);
2531  if (NIL_P(tmp)) {
2532  rb_raise(rb_eTypeError, "type mismatch: %s given",
2533  rb_obj_classname(sub));
2534  }
2535  sub = tmp;
2536  }
2537  /* fall through */
2538  case T_STRING:
2539  pos = rb_str_index(str, sub, pos);
2540  pos = rb_str_sublen(str, pos);
2541  break;
2542  }
2543 
2544  if (pos == -1) return Qnil;
2545  return LONG2NUM(pos);
2546 }
2547 
2548 static long
2549 rb_str_rindex(VALUE str, VALUE sub, long pos)
2550 {
2551  long len, slen;
2552  char *s, *sbeg, *e, *t;
2553  rb_encoding *enc;
2554  int singlebyte = single_byte_optimizable(str);
2555 
2556  enc = rb_enc_check(str, sub);
2557  if (is_broken_string(sub)) {
2558  return -1;
2559  }
2560  len = str_strlen(str, enc);
2561  slen = str_strlen(sub, enc);
2562  /* substring longer than string */
2563  if (len < slen) return -1;
2564  if (len - pos < slen) {
2565  pos = len - slen;
2566  }
2567  if (len == 0) {
2568  return pos;
2569  }
2570  sbeg = RSTRING_PTR(str);
2571  e = RSTRING_END(str);
2572  t = RSTRING_PTR(sub);
2573  slen = RSTRING_LEN(sub);
2574  s = str_nth(sbeg, e, pos, enc, singlebyte);
2575  while (s) {
2576  if (memcmp(s, t, slen) == 0) {
2577  return pos;
2578  }
2579  if (pos == 0) break;
2580  pos--;
2581  s = rb_enc_prev_char(sbeg, s, e, enc);
2582  }
2583  return -1;
2584 }
2585 
2586 
2587 /*
2588  * call-seq:
2589  * str.rindex(substring [, fixnum]) -> fixnum or nil
2590  * str.rindex(regexp [, fixnum]) -> fixnum or nil
2591  *
2592  * Returns the index of the last occurrence of the given <i>substring</i> or
2593  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2594  * found. If the second parameter is present, it specifies the position in the
2595  * string to end the search---characters beyond this point will not be
2596  * considered.
2597  *
2598  * "hello".rindex('e') #=> 1
2599  * "hello".rindex('l') #=> 3
2600  * "hello".rindex('a') #=> nil
2601  * "hello".rindex(?e) #=> 1
2602  * "hello".rindex(/[aeiou]/, -2) #=> 1
2603  */
2604 
/*
 * Implementation notes: the optional second argument is the character
 * position at which the backward search starts (default: the string's
 * character length; negative counts from the end; values past the end are
 * clamped).  A Regexp is searched with rb_reg_search() in reverse mode;
 * anything else must convert to a String (TypeError otherwise) and is
 * searched with rb_str_rindex().
 * NOTE(review): listing lines 2606 (function name and parameter list),
 * 2619 (body of the T_REGEXP check for an out-of-range start) and 2634
 * (remaining arguments of the str_offset() call) are missing from this
 * extract.
 */
2605 static VALUE
2607 {
2608  VALUE sub;
2609  VALUE vpos;
2610  rb_encoding *enc = STR_ENC_GET(str);
2611  long pos, len = str_strlen(str, enc);
2612 
2613  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2614  pos = NUM2LONG(vpos);
2615  if (pos < 0) {
2616  pos += len;
2617  if (pos < 0) {
2618  if (TYPE(sub) == T_REGEXP) {
2620  }
2621  return Qnil;
2622  }
2623  }
2624  if (pos > len) pos = len;
2625  }
2626  else {
2627  pos = len;
2628  }
2629 
2630  switch (TYPE(sub)) {
2631  case T_REGEXP:
2632  /* enc = rb_get_check(str, sub); */
2633  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2635 
/* the search is skipped for a compiled regexp whose source length is
 * zero; pos is then returned as computed above */
2636  if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2637  pos = rb_reg_search(sub, str, pos, 1);
2638  pos = rb_str_sublen(str, pos);
2639  }
2640  if (pos >= 0) return LONG2NUM(pos);
2641  break;
2642 
2643  default: {
2644  VALUE tmp;
2645 
2646  tmp = rb_check_string_type(sub);
2647  if (NIL_P(tmp)) {
2648  rb_raise(rb_eTypeError, "type mismatch: %s given",
2649  rb_obj_classname(sub));
2650  }
2651  sub = tmp;
2652  }
2653  /* fall through */
2654  case T_STRING:
2655  pos = rb_str_rindex(str, sub, pos);
2656  if (pos >= 0) return LONG2NUM(pos);
2657  break;
2658  }
2659  return Qnil;
2660 }
2661 
2662 /*
2663  * call-seq:
2664  * str =~ obj -> fixnum or nil
2665  *
2666  * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2667  * against <i>str</i>, and return the position where the match starts, or
2668  * <code>nil</code> if there is no match. Otherwise, invokes
2669  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2670  * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2671  *
2672  * "cat o' 9 tails" =~ /\d/ #=> 7
2673  * "cat o' 9 tails" =~ 9 #=> nil
2674  */
2675 
2676 static VALUE
 /* NOTE(review): the declarator line is missing from this listing. The body
  * uses x and y; y is the operand whose type is inspected and x is what gets
  * matched against it -- confirm names and order against the upstream file. */
2678 {
 /* Dispatch on the runtime type of y. */
2679  switch (TYPE(y)) {
2680  case T_STRING:
 /* A String operand is rejected with a TypeError rather than being
  * treated as a pattern. */
2681  rb_raise(rb_eTypeError, "type mismatch: String given");
2682 
2683  case T_REGEXP:
2684  return rb_reg_match(y, x);
2685 
2686  default:
 /* Any other object: call its own =~ method with x as the sole argument. */
2687  return rb_funcall(y, rb_intern("=~"), 1, x);
2688  }
2689 }
2690 
2691 
2692 static VALUE get_pat(VALUE, int);
2693 
2694 
2695 /*
2696  * call-seq:
2697  * str.match(pattern) -> matchdata or nil
2698  * str.match(pattern, pos) -> matchdata or nil
2699  *
2700  * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2701  * then invokes its <code>match</code> method on <i>str</i>. If the second
2702  * parameter is present, it specifies the position in the string to begin the
2703  * search.
2704  *
2705  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
2706  * 'hello'.match('(.)\1')[0] #=> "ll"
2707  * 'hello'.match(/(.)\1/)[0] #=> "ll"
2708  * 'hello'.match('xx') #=> nil
2709  *
2710  * If a block is given, invokes the block with the MatchData if the match succeeds, so
2711  * that you can write
2712  *
2713  * str.match(pat) {|m| ...}
2714  *
2715  * instead of
2716  *
2717  * if m = str.match(pat)
2718  * ...
2719  * end
2720  *
2721  * The return value is a value from block execution in this case.
2722  */
2723 
2724 static VALUE
 /* NOTE(review): the declarator line is missing from this listing; the body
  * reads argc, argv and str -- confirm against the upstream file. */
2726 {
2727  VALUE re, result;
2728  if (argc < 1)
2729  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
 /* Swap roles: the pattern (argv[0]) becomes the receiver and str takes its
  * slot in argv, so remaining arguments are forwarded unchanged. Note that
  * this writes into the caller-supplied argv array. */
2730  re = argv[0];
2731  argv[0] = str;
 /* get_pat(re, 0): obtain a Regexp from re without quoting it first. */
2732  result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
 /* With a block and a non-nil result, the block's value is returned. */
2733  if (!NIL_P(result) && rb_block_given_p()) {
2734  return rb_yield(result);
2735  }
2736  return result;
2737 }
2738 
/*
 * Outcome of stepping a single character to its neighbour (see
 * enc_succ_char, enc_pred_char and enc_succ_alnum_char).
 *
 * NOTE(review): only the closing "};" of this definition survived in the
 * listing; the tag and enumerator names are taken from their uses in this
 * file, the enumerator order should be confirmed against upstream.
 */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input was not a steppable character */
    NEIGHBOR_FOUND,     /* a neighbouring character was produced */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
2744 
2745 static enum neighbor_char
2746 enc_succ_char(char *p, long len, rb_encoding *enc)
2747 {
2748  long i;
2749  int l;
2750  while (1) {
2751  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2752  p[i] = '\0';
2753  if (i < 0)
2754  return NEIGHBOR_WRAPPED;
2755  ++((unsigned char*)p)[i];
2756  l = rb_enc_precise_mbclen(p, p+len, enc);
2757  if (MBCLEN_CHARFOUND_P(l)) {
2758  l = MBCLEN_CHARFOUND_LEN(l);
2759  if (l == len) {
2760  return NEIGHBOR_FOUND;
2761  }
2762  else {
2763  memset(p+l, 0xff, len-l);
2764  }
2765  }
2766  if (MBCLEN_INVALID_P(l) && i < len-1) {
2767  long len2;
2768  int l2;
2769  for (len2 = len-1; 0 < len2; len2--) {
2770  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2771  if (!MBCLEN_INVALID_P(l2))
2772  break;
2773  }
2774  memset(p+len2+1, 0xff, len-(len2+1));
2775  }
2776  }
2777 }
2778 
2779 static enum neighbor_char
2780 enc_pred_char(char *p, long len, rb_encoding *enc)
2781 {
2782  long i;
2783  int l;
2784  while (1) {
2785  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2786  p[i] = '\xff';
2787  if (i < 0)
2788  return NEIGHBOR_WRAPPED;
2789  --((unsigned char*)p)[i];
2790  l = rb_enc_precise_mbclen(p, p+len, enc);
2791  if (MBCLEN_CHARFOUND_P(l)) {
2792  l = MBCLEN_CHARFOUND_LEN(l);
2793  if (l == len) {
2794  return NEIGHBOR_FOUND;
2795  }
2796  else {
2797  memset(p+l, 0, len-l);
2798  }
2799  }
2800  if (MBCLEN_INVALID_P(l) && i < len-1) {
2801  long len2;
2802  int l2;
2803  for (len2 = len-1; 0 < len2; len2--) {
2804  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2805  if (!MBCLEN_INVALID_P(l2))
2806  break;
2807  }
2808  memset(p+len2+1, 0, len-(len2+1));
2809  }
2810  }
2811 }
2812 
2813 /*
2814  overwrite +p+ by succeeding letter in +enc+ and returns
2815  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2816  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2817  assuming each ranges are successive, and mbclen
2818  never change in each ranges.
2819  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2820  character.
2821  */
2822 static enum neighbor_char
2823 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2824 {
2825  enum neighbor_char ret;
2826  unsigned int c;
2827  int ctype;
2828  int range;
2829  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2830 
2831  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2832  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2833  ctype = ONIGENC_CTYPE_DIGIT;
2834  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2835  ctype = ONIGENC_CTYPE_ALPHA;
2836  else
2837  return NEIGHBOR_NOT_CHAR;
2838 
2839  MEMCPY(save, p, char, len);
2840  ret = enc_succ_char(p, len, enc);
2841  if (ret == NEIGHBOR_FOUND) {
2842  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2843  if (rb_enc_isctype(c, ctype, enc))
2844  return NEIGHBOR_FOUND;
2845  }
2846  MEMCPY(p, save, char, len);
2847  range = 1;
2848  while (1) {
2849  MEMCPY(save, p, char, len);
2850  ret = enc_pred_char(p, len, enc);
2851  if (ret == NEIGHBOR_FOUND) {
2852  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2853  if (!rb_enc_isctype(c, ctype, enc)) {
2854  MEMCPY(p, save, char, len);
2855  break;
2856  }
2857  }
2858  else {
2859  MEMCPY(p, save, char, len);
2860  break;
2861  }
2862  range++;
2863  }
2864  if (range == 1) {
2865  return NEIGHBOR_NOT_CHAR;
2866  }
2867 
2868  if (ctype != ONIGENC_CTYPE_DIGIT) {
2869  MEMCPY(carry, p, char, len);
2870  return NEIGHBOR_WRAPPED;
2871  }
2872 
2873  MEMCPY(carry, p, char, len);
2874  enc_succ_char(carry, len, enc);
2875  return NEIGHBOR_WRAPPED;
2876 }
2877 
2878 
2879 /*
2880  * call-seq:
2881  * str.succ -> new_str
2882  * str.next -> new_str
2883  *
2884  * Returns the successor to <i>str</i>. The successor is calculated by
2885  * incrementing characters starting from the rightmost alphanumeric (or
2886  * the rightmost character if there are no alphanumerics) in the
2887  * string. Incrementing a digit always results in another digit, and
2888  * incrementing a letter results in another letter of the same case.
2889  * Incrementing nonalphanumerics uses the underlying character set's
2890  * collating sequence.
2891  *
2892  * If the increment generates a ``carry,'' the character to the left of
2893  * it is incremented. This process repeats until there is no carry,
2894  * adding an additional character if necessary.
2895  *
2896  * "abcd".succ #=> "abce"
2897  * "THX1138".succ #=> "THX1139"
2898  * "<<koala>>".succ #=> "<<koalb>>"
2899  * "1999zzz".succ #=> "2000aaa"
2900  * "ZZZ9999".succ #=> "AAAA0000"
2901  * "***".succ #=> "**+"
2902  */
2903 
2904 VALUE
 /* NOTE(review): the declarator line is missing from this listing; the body
  * reads a single input named orig -- confirm against the upstream file. */
2906 {
2907  rb_encoding *enc;
2908  VALUE str;
2909  char *sbeg, *s, *e, *last_alnum = 0;
 /* c stays -1 until some alphanumeric character has wrapped (then it is 1). */
2910  int c = -1;
2911  long l;
 /* Default carry: the single byte 0x01, inserted at offset 0. */
2912  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2913  long carry_pos = 0, carry_len = 1;
2914  enum neighbor_char neighbor = NEIGHBOR_FOUND;
2915 
 /* All work is done on str, a new string built from orig's bytes; orig
  * itself is only read. An empty string is returned as is. */
2916  str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2917  rb_enc_cr_str_copy_for_substr(str, orig);
2918  OBJ_INFECT(str, orig);
2919  if (RSTRING_LEN(str) == 0) return str;
2920 
2921  enc = STR_ENC_GET(orig);
2922  sbeg = RSTRING_PTR(str);
2923  s = e = sbeg + RSTRING_LEN(str);
2924 
 /* Pass 1: walk backwards one character at a time, incrementing
  * alphanumerics and propagating the carry to the left. */
2925  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
 /* A wrapped alphanumeric has been seen and the character just skipped was
  * not alphanumeric: if the current one belongs to the other class
  * (letter vs. digit), stop and put the carry at the last wrapped position. */
2926  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2927  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2928  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2929  s = last_alnum;
2930  break;
2931  }
2932  }
 /* Skip positions where no character length could be determined. */
2933  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2934  neighbor = enc_succ_alnum_char(s, l, enc, carry);
2935  switch (neighbor) {
2936  case NEIGHBOR_NOT_CHAR:
2937  continue;
2938  case NEIGHBOR_FOUND:
 /* Incremented without wrapping: nothing more to do. */
2939  return str;
2940  case NEIGHBOR_WRAPPED:
2941  last_alnum = s;
2942  break;
2943  }
 /* Wrapped: remember where a carry character would have to be inserted. */
2944  c = 1;
2945  carry_pos = s - sbeg;
2946  carry_len = l;
2947  }
2948  if (c == -1) { /* str contains no alnum */
 /* Pass 2: increment raw characters from the right instead. */
2949  s = e;
2950  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2951  enum neighbor_char neighbor;
2952  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2953  neighbor = enc_succ_char(s, l, enc);
2954  if (neighbor == NEIGHBOR_FOUND)
2955  return str;
2956  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2957  /* wrapped to \0...\0. search next valid char. */
2958  enc_succ_char(s, l, enc);
2959  }
 /* For encodings that are not ASCII compatible the default "\1" carry is
  * replaced by a copy of the character now stored at s. */
2960  if (!rb_enc_asciicompat(enc)) {
2961  MEMCPY(carry, s, char, l);
2962  carry_len = l;
2963  }
2964  carry_pos = s - sbeg;
2965  }
2966  }
 /* Every character wrapped: grow the buffer, shift the tail right by
  * carry_len and insert the carry at carry_pos. The pointer is re-read after
  * RESIZE_CAPA -- presumably because the buffer may move. */
2967  RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2968  s = RSTRING_PTR(str) + carry_pos;
2969  memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2970  memmove(s, carry, carry_len);
2971  STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2972  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2973  rb_enc_str_coderange(str);
2974  return str;
2975 }
2976 
2977 
2978 /*
2979  * call-seq:
2980  * str.succ! -> str
2981  * str.next! -> str
2982  *
2983  * Equivalent to <code>String#succ</code>, but modifies the receiver in
2984  * place.
2985  */
2986 
2987 static VALUE
 /* NOTE(review): the declarator line is missing from this listing, and the
  * embedded numbering jumps from 2989 to 2991, so one body statement appears
  * to be missing as well. As listed, the function only returns str; the
  * in-place update described in the comment above is not visible here --
  * confirm against the upstream file. */
2989 {
2991 
2992  return str;
2993 }
2994 
2995 
2996 /*
2997  * call-seq:
2998  * str.upto(other_str, exclusive=false) {|s| block } -> str
2999  * str.upto(other_str, exclusive=false) -> an_enumerator
3000  *
3001  * Iterates through successive values, starting at <i>str</i> and
3002  * ending at <i>other_str</i> inclusive, passing each value in turn to
3003  * the block. The <code>String#succ</code> method is used to generate
3004  * each value. If optional second argument exclusive is omitted or is false,
3005  * the last value will be included; otherwise it will be excluded.
3006  *
3007  * If no block is given, an enumerator is returned instead.
3008  *
3009  * "a8".upto("b6") {|s| print s, ' ' }
3010  * for s in "a8".."b6"
3011  * print s, ' '
3012  * end
3013  *
3014  * <em>produces:</em>
3015  *
3016  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3017  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3018  *
3019  * If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
3020  * both are recognized as decimal numbers. In addition, the width of
3021  * string (e.g. leading zeros) is handled appropriately.
3022  *
3023  * "9".upto("11").to_a #=> ["9", "10", "11"]
3024  * "25".upto("5").to_a #=> []
3025  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
3026  */
3027 
3028 static VALUE
 /* NOTE(review): the declarator line is missing from this listing; the body
  * reads argc, argv and beg (the receiver) -- confirm against the upstream
  * file. */
3030 {
3031  VALUE end, exclusive;
3032  VALUE current, after_end;
3033  ID succ;
3034  int n, excl, ascii;
3035  rb_encoding *enc;
3036 
 /* "11": the end string is mandatory, the exclusive flag is optional. */
3037  rb_scan_args(argc, argv, "11", &end, &exclusive);
3038  RETURN_ENUMERATOR(beg, argc, argv);
3039  excl = RTEST(exclusive);
3040  CONST_ID(succ, "succ");
3041  StringValue(end);
3042  enc = rb_enc_check(beg, end);
3043  ascii = (is_ascii_string(beg) && is_ascii_string(end));
3044  /* single character */
 /* Fast path: both ends are one byte long, so a plain char counter is
  * stepped and a fresh one-byte string is yielded each round. */
3045  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3046  char c = RSTRING_PTR(beg)[0];
3047  char e = RSTRING_PTR(end)[0];
3048 
3049  if (c > e || (excl && c == e)) return beg;
3050  for (;;) {
3051  rb_yield(rb_enc_str_new(&c, 1, enc));
 /* Inclusive: stop after yielding e. Exclusive: stop before reaching e. */
3052  if (!excl && c == e) break;
3053  c++;
3054  if (excl && c == e) break;
3055  }
3056  return beg;
3057  }
3058  /* both edges are all digits */
 /* Only the first byte of each end is tested here; the loops below verify
  * the remaining bytes and jump to the generic path on the first non-digit. */
3059  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3060  char *s, *send;
3061  VALUE b, e;
3062  int width;
3063 
3064  s = RSTRING_PTR(beg); send = RSTRING_END(beg);
 /* width = byte length of beg; used as the minimum digit count when
  * formatting, which keeps leading zeros. */
3065  width = rb_long2int(send - s);
3066  while (s < send) {
3067  if (!ISDIGIT(*s)) goto no_digits;
3068  s++;
3069  }
3070  s = RSTRING_PTR(end); send = RSTRING_END(end);
3071  while (s < send) {
3072  if (!ISDIGIT(*s)) goto no_digits;
3073  s++;
3074  }
3075  b = rb_str_to_inum(beg, 10, FALSE);
3076  e = rb_str_to_inum(end, 10, FALSE);
 /* Both values are Fixnums: count with native longs. */
3077  if (FIXNUM_P(b) && FIXNUM_P(e)) {
3078  long bi = FIX2LONG(b);
3079  long ei = FIX2LONG(e);
3080  rb_encoding *usascii = rb_usascii_encoding();
3081 
3082  while (bi <= ei) {
3083  if (excl && bi == ei) break;
3084  rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3085  bi++;
3086  }
3087  }
3088  else {
 /* Otherwise compare and step through method calls on the integer objects.
  * NOTE(review): the loop condition tests the raw VALUE returned by
  * rb_funcall rather than RTEST() of it -- confirm that is intended. */
3089  ID op = excl ? '<' : rb_intern("<=");
3090  VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3091 
3092  args[0] = INT2FIX(width);
3093  while (rb_funcall(b, op, 1, e)) {
3094  args[1] = b;
3095  rb_yield(rb_str_format(numberof(args), args, fmt));
3096  b = rb_funcall(b, succ, 0, 0);
3097  }
3098  }
3099  return beg;
3100  }
3101  /* normal case */
3102  no_digits:
 /* Nothing to iterate when beg sorts after end (or equals it while
  * exclusive). */
3103  n = rb_str_cmp(beg, end);
3104  if (n > 0 || (excl && n == 0)) return beg;
3105 
 /* after_end is the successor of end; reaching it terminates the loop. */
3106  after_end = rb_funcall(end, succ, 0, 0);
3107  current = rb_str_dup(beg);
3108  while (!rb_str_equal(current, after_end)) {
3109  VALUE next = Qnil;
 /* The successor is computed before yielding, and is left nil when current
  * is already the inclusive end. */
3110  if (excl || !rb_str_equal(current, end))
3111  next = rb_funcall(current, succ, 0, 0);
3112  rb_yield(current);
3113  if (NIL_P(next)) break;
3114  current = next;
3115  StringValue(current);
3116  if (excl && rb_str_equal(current, end)) break;
 /* Stop once the value is longer than end, or empty. */
3117  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3118  break;
3119  }
3120 
3121  return beg;
3122 }
3123 
3124 static VALUE
3125 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3126 {
3127  if (rb_reg_search(re, str, 0, 0) >= 0) {
3129  int nth = rb_reg_backref_number(match, backref);
3130  return rb_reg_nth_match(nth, match);
3131  }
3132  return Qnil;
3133 }
3134 
3135 static VALUE
3137 {
3138  long idx;
3139 
3140  switch (TYPE(indx)) {
3141  case T_FIXNUM:
3142  idx = FIX2LONG(indx);
3143 
3144  num_index:
3145  str = rb_str_substr(str, idx, 1);
3146  if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3147  return str;
3148 
3149  case T_REGEXP:
3150  return rb_str_subpat(str, indx, INT2FIX(0));
3151 
3152  case T_STRING:
3153  if (rb_str_index(str, indx, 0) != -1)
3154  return rb_str_dup(indx);
3155  return Qnil;
3156 
3157  default:
3158  /* check if indx is Range */
3159  {
3160  long beg, len;
3161  VALUE tmp;
3162 
3163  len = str_strlen(str, STR_ENC_GET(str));
3164  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3165  case Qfalse:
3166  break;
3167  case Qnil:
3168  return Qnil;
3169  default:
3170  tmp = rb_str_substr(str, beg, len);
3171  return tmp;
3172  }
3173  }
3174  idx = NUM2LONG(indx);
3175  goto num_index;
3176  }
3177  return Qnil; /* not reached */
3178 }
3179 
3180 
3181 /*
3182  * call-seq:
3183  * str[fixnum] -> new_str or nil
3184  * str[fixnum, fixnum] -> new_str or nil
3185  * str[range] -> new_str or nil
3186  * str[regexp] -> new_str or nil
3187  * str[regexp, fixnum] -> new_str or nil
3188  * str[other_str] -> new_str or nil
3189  * str.slice(fixnum) -> new_str or nil
3190  * str.slice(fixnum, fixnum) -> new_str or nil
3191  * str.slice(range) -> new_str or nil
3192  * str.slice(regexp) -> new_str or nil
3193  * str.slice(regexp, fixnum) -> new_str or nil
3194  * str.slice(regexp, capname) -> new_str or nil
3195  * str.slice(other_str) -> new_str or nil
3196  *
3197  * Element Reference---If passed a single <code>Fixnum</code>, returns a
3198  * substring of one character at that position. If passed two <code>Fixnum</code>
3199  * objects, returns a substring starting at the offset given by the first, and
3200  * with a length given by the second. If passed a range, its beginning and end
3201  * are interpreted as offsets delimiting the substring to be returned. In all
3202  * three cases, if an offset is negative, it is counted from the end of <i>str</i>.
3203  * Returns <code>nil</code> if the initial offset falls outside the string or
3204  * the length is negative.
3205  *
3206  * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
3207  * returned. If a numeric or name parameter follows the regular expression, that
3208  * component of the <code>MatchData</code> is returned instead. If a
3209  * <code>String</code> is given, that string is returned if it occurs in
3210  * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
3211  * match.
3212  *
3213  * a = "hello there"
3214  * a[1] #=> "e"
3215  * a[2, 3] #=> "llo"
3216  * a[2..3] #=> "ll"
3217  * a[-3, 2] #=> "er"
3218  * a[7..-2] #=> "her"
3219  * a[-4..-2] #=> "her"
3220  * a[-2..-4] #=> ""
3221  * a[12..-1] #=> nil
3222  * a[/[aeiou](.)\1/] #=> "ell"
3223  * a[/[aeiou](.)\1/, 0] #=> "ell"
3224  * a[/[aeiou](.)\1/, 1] #=> "l"
3225  * a[/[aeiou](.)\1/, 2] #=> nil
3226  * a["lo"] #=> "lo"
3227  * a["bye"] #=> nil
3228  */
3229 
3230 static VALUE
3232 {
3233  if (argc == 2) {
3234  if (TYPE(argv[0]) == T_REGEXP) {
3235  return rb_str_subpat(str, argv[0], argv[1]);
3236  }
3237  return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3238  }
3239  if (argc != 1) {
3240  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3241  }
3242  return rb_str_aref(str, argv[0]);
3243 }
3244 
3245 VALUE
3246 rb_str_drop_bytes(VALUE str, long len)
3247 {
3248  char *ptr = RSTRING_PTR(str);
3249  long olen = RSTRING_LEN(str), nlen;
3250 
3251  str_modifiable(str);
3252  if (len > olen) len = olen;
3253  nlen = olen - len;
3254  if (nlen <= RSTRING_EMBED_LEN_MAX) {
3255  char *oldptr = ptr;
3256  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
3257  STR_SET_EMBED(str);
3258  STR_SET_EMBED_LEN(str, nlen);
3259  ptr = RSTRING(str)->as.ary;
3260  memmove(ptr, oldptr + len, nlen);
3261  if (fl == STR_NOEMBED) xfree(oldptr);
3262  }
3263  else {
3264  if (!STR_SHARED_P(str)) rb_str_new4(str);
3265  ptr = RSTRING(str)->as.heap.ptr += len;
3266  RSTRING(str)->as.heap.len = nlen;
3267  }
3268  ptr[nlen] = 0;
3269  ENC_CODERANGE_CLEAR(str);
3270  return str;
3271 }
3272 
3273 static void
3274 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
3275 {
3276  if (beg == 0 && RSTRING_LEN(val) == 0) {
3277  rb_str_drop_bytes(str, len);
3278  OBJ_INFECT(str, val);
3279  return;
3280  }
3281 
3282  rb_str_modify(str);
3283  if (len < RSTRING_LEN(val)) {
3284  /* expand string */
3285  RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
3286  }
3287 
3288  if (RSTRING_LEN(val) != len) {
3289  memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
3290  RSTRING_PTR(str) + beg + len,
3291  RSTRING_LEN(str) - (beg + len));
3292  }
3293  if (RSTRING_LEN(val) < beg && len < 0) {
3294  MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
3295  }
3296  if (RSTRING_LEN(val) > 0) {
3297  memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
3298  }
3299  STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
3300  if (RSTRING_PTR(str)) {
3301  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3302  }
3303  OBJ_INFECT(str, val);
3304 }
3305 
 /* Replace len units of str starting at index beg with val. beg and len
  * arrive as logical positions and are converted to byte offsets with
  * str_nth() before rb_str_splice_0() does the byte-level work.
  * Raises IndexError for a negative len or a beg outside the string. */
3306 static void
3307 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3308 {
3309  long slen;
3310  char *p, *e;
3311  rb_encoding *enc;
3312  int singlebyte = single_byte_optimizable(str);
3313  int cr;
3314 
3315  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3316 
3317  StringValue(val);
3318  enc = rb_enc_check(str, val);
3319  slen = str_strlen(str, enc);
3320 
 /* beg == slen is allowed (append position); anything beyond is an error.
  * The label is shared with the negative-index check below. */
3321  if (slen < beg) {
3322  out_of_range:
3323  rb_raise(rb_eIndexError, "index %ld out of string", beg);
3324  }
 /* A negative beg is taken relative to slen. */
3325  if (beg < 0) {
3326  if (-beg > slen) {
3327  goto out_of_range;
3328  }
3329  beg += slen;
3330  }
 /* Clamp len so that the replaced range ends at the end of the string. */
3331  if (slen < len || slen < beg + len) {
3332  len = slen - beg;
3333  }
3334  str_modify_keep_cr(str);
 /* Translate (beg, len) into pointers; a null result from str_nth() is
  * replaced by the end of the string. */
3335  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3336  if (!p) p = RSTRING_END(str);
3337  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3338  if (!e) e = RSTRING_END(str);
3339  /* error check */
3340  beg = p - RSTRING_PTR(str); /* physical position */
3341  len = e - p; /* physical length */
3342  rb_str_splice_0(str, beg, len, val);
3343  rb_enc_associate(str, enc);
 /* NOTE(review): cr is read below but no assignment to it is visible; the
  * embedded numbering skips 3344, so the statement that computes it appears
  * to be missing from this listing -- confirm against the upstream file. */
3345  if (cr != ENC_CODERANGE_BROKEN)
3346  ENC_CODERANGE_SET(str, cr);
3347 }
3348 
3349 void
3350 rb_str_update(VALUE str, long beg, long len, VALUE val)
3351 {
3352  rb_str_splice(str, beg, len, val);
3353 }
3354 
3355 static void
3356 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3357 {
3358  int nth;
3359  VALUE match;
3360  long start, end, len;
3361  rb_encoding *enc;
3362  struct re_registers *regs;
3363 
3364  if (rb_reg_search(re, str, 0, 0) < 0) {
3365  rb_raise(rb_eIndexError, "regexp not matched");
3366  }
3367  match = rb_backref_get();
3368  nth = rb_reg_backref_number(match, backref);
3369  regs = RMATCH_REGS(match);
3370  if (nth >= regs->num_regs) {
3371  out_of_range:
3372  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3373  }
3374  if (nth < 0) {
3375  if (-nth >= regs->num_regs) {
3376  goto out_of_range;
3377  }
3378  nth += regs->num_regs;
3379  }
3380 
3381  start = BEG(nth);
3382  if (start == -1) {
3383  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3384  }
3385  end = END(nth);
3386  len = end - start;
3387  StringValue(val);
3388  enc = rb_enc_check(str, val);
3389  rb_str_splice_0(str, start, len, val);
3390  rb_enc_associate(str, enc);
3391 }
3392 
3393 static VALUE
3394 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3395 {
3396  long idx, beg;
3397 
3398  switch (TYPE(indx)) {
3399  case T_FIXNUM:
3400  idx = FIX2LONG(indx);
3401  num_index:
3402  rb_str_splice(str, idx, 1, val);
3403  return val;
3404 
3405  case T_REGEXP:
3406  rb_str_subpat_set(str, indx, INT2FIX(0), val);
3407  return val;
3408 
3409  case T_STRING:
3410  beg = rb_str_index(str, indx, 0);
3411  if (beg < 0) {
3412  rb_raise(rb_eIndexError, "string not matched");
3413  }
3414  beg = rb_str_sublen(str, beg);
3415  rb_str_splice(str, beg, str_strlen(indx, 0), val);
3416  return val;
3417 
3418  default:
3419  /* check if indx is Range */
3420  {
3421  long beg, len;
3422  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3423  rb_str_splice(str, beg, len, val);
3424  return val;
3425  }
3426  }
3427  idx = NUM2LONG(indx);
3428  goto num_index;
3429  }
3430 }
3431 
3432 /*
3433  * call-seq:
3434  * str[fixnum] = new_str
3435  * str[fixnum, fixnum] = new_str
3436  * str[range] = aString
3437  * str[regexp] = new_str
3438  * str[regexp, fixnum] = new_str
3439  * str[regexp, name] = new_str
3440  * str[other_str] = new_str
3441  *
3442  * Element Assignment---Replaces some or all of the content of <i>str</i>. The
3443  * portion of the string affected is determined using the same criteria as
3444  * <code>String#[]</code>. If the replacement string is not the same length as
3445  * the text it is replacing, the string will be adjusted accordingly. If the
3446  * regular expression or string used as the index doesn't match a position
3447  * in the string, <code>IndexError</code> is raised. If the regular expression
3448  * form is used, the optional second <code>Fixnum</code> allows you to specify
3449  * which portion of the match to replace (effectively using the
3450  * <code>MatchData</code> indexing rules). The forms that take a
3451  * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3452  * out of range; the <code>Range</code> form will raise a
3453  * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3454  * forms will raise an <code>IndexError</code> when there is no match.
3455  */
3456 
3457 static VALUE
3459 {
3460  if (argc == 3) {
3461  if (TYPE(argv[0]) == T_REGEXP) {
3462  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3463  }
3464  else {
3465  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3466  }
3467  return argv[2];
3468  }
3469  if (argc != 2) {
3470  rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
3471  }
3472  return rb_str_aset(str, argv[0], argv[1]);
3473 }
3474 
3475 /*
3476  * call-seq:
3477  * str.insert(index, other_str) -> str
3478  *
3479  * Inserts <i>other_str</i> before the character at the given
3480  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
3481  * end of the string, and insert <em>after</em> the given character.
3482  * The intent is to insert <i>other_str</i> so that it starts at the given
3483  * <i>index</i>.
3484  *
3485  * "abcd".insert(0, 'X') #=> "Xabcd"
3486  * "abcd".insert(3, 'X') #=> "abcXd"
3487  * "abcd".insert(4, 'X') #=> "abcdX"
3488  * "abcd".insert(-3, 'X') #=> "abXcd"
3489  * "abcd".insert(-1, 'X') #=> "abcdX"
3490  */
3491 
3491 static VALUE
 /* NOTE(review): the declarator line is missing from this listing; the body
  * reads str (receiver), idx (position) and str2 (text to insert) -- confirm
  * names and order against the upstream file. */
3493 {
3494  long pos = NUM2LONG(idx);
3495 
 /* -1 means "after the last character", which is a plain append. */
3496  if (pos == -1) {
3497  return rb_str_append(str, str2);
3498  }
 /* Other negative positions insert after the addressed character, so they
  * are moved one step to the right before splicing. */
3499  else if (pos < 0) {
3500  pos++;
3501  }
 /* Zero-length splice: nothing is replaced, str2 is inserted at pos. */
3502  rb_str_splice(str, pos, 0, str2);
3503  return str;
3504 }
3506 
3507 
3508 /*
3509  * call-seq:
3510  * str.slice!(fixnum) -> new_str or nil
3511  * str.slice!(fixnum, fixnum) -> new_str or nil
3512  * str.slice!(range) -> new_str or nil
3513  * str.slice!(regexp) -> new_str or nil
3514  * str.slice!(other_str) -> new_str or nil
3515  *
3516  * Deletes the specified portion from <i>str</i>, and returns the portion
3517  * deleted.
3518  *
3519  * string = "this is a string"
3520  * string.slice!(2) #=> "i"
3521  * string.slice!(3..6) #=> " is "
3522  * string.slice!(/s.*t/) #=> "sa st"
3523  * string.slice!("r") #=> "r"
3524  * string #=> "thing"
3525  */
3526 
3527 static VALUE
 /* NOTE(review): the declarator line is missing from this listing; the body
  * reads argc, argv and str -- confirm against the upstream file. */
3529 {
3530  VALUE result;
 /* Room for the (at most two) index arguments plus the replacement string
  * appended below. */
3531  VALUE buf[3];
3532  int i;
3533 
3534  if (argc < 1 || 2 < argc) {
3535  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3536  }
3537  for (i=0; i<argc; i++) {
3538  buf[i] = argv[i];
3539  }
3540  str_modify_keep_cr(str);
 /* Step 1: fetch the selected portion using the same arguments. */
3541  result = rb_str_aref_m(argc, buf, str);
3542  if (!NIL_P(result)) {
 /* Step 2: something was selected, so assign an empty string to the same
  * selection to delete it. After the copy loop i == argc, so buf[i] is the
  * first free slot. */
3543  buf[i] = rb_str_new(0,0);
3544  rb_str_aset_m(argc+1, buf, str);
3545  }
3546  return result;
3547 }
3548 
3549 static VALUE
3550 get_pat(VALUE pat, int quote)
3551 {
3552  VALUE val;
3553 
3554  switch (TYPE(pat)) {
3555  case T_REGEXP:
3556  return pat;
3557 
3558  case T_STRING:
3559  break;
3560 
3561  default:
3562  val = rb_check_string_type(pat);
3563  if (NIL_P(val)) {
3564  Check_Type(pat, T_REGEXP);
3565  }
3566  pat = val;
3567  }
3568 
3569  if (quote) {
3570  pat = rb_reg_quote(pat);
3571  }
3572 
3573  return rb_reg_regcomp(pat);
3574 }
3575 
3576 
3577 /*
3578  * call-seq:
3579  * str.sub!(pattern, replacement) -> str or nil
3580  * str.sub!(pattern) {|match| block } -> str or nil
3581  *
3582  * Performs the substitutions of <code>String#sub</code> in place,
3583  * returning <i>str</i>, or <code>nil</code> if no substitutions were
3584  * performed.
3585  */
3586 
3587 static VALUE
3589 {
3590  VALUE pat, repl, hash = Qnil;
3591  int iter = 0;
3592  int tainted = 0;
3593  int untrusted = 0;
3594  long plen;
3595 
3596  if (argc == 1 && rb_block_given_p()) {
3597  iter = 1;
3598  }
3599  else if (argc == 2) {
3600  repl = argv[1];
3601  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3602  if (NIL_P(hash)) {
3603  StringValue(repl);
3604  }
3605  if (OBJ_TAINTED(repl)) tainted = 1;
3606  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3607  }
3608  else {
3609  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3610  }
3611 
3612  pat = get_pat(argv[0], 1);
3613  str_modifiable(str);
3614  if (rb_reg_search(pat, str, 0, 0) >= 0) {
3615  rb_encoding *enc;
3616  int cr = ENC_CODERANGE(str);
3618  struct re_registers *regs = RMATCH_REGS(match);
3619  long beg0 = BEG(0);
3620  long end0 = END(0);
3621  char *p, *rp;
3622  long len, rlen;
3623 
3624  if (iter || !NIL_P(hash)) {
3625  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3626 
3627  if (iter) {
3628  repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3629  }
3630  else {
3631  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3632  repl = rb_obj_as_string(repl);
3633  }
3634  str_mod_check(str, p, len);
3635  rb_check_frozen(str);
3636  }
3637  else {
3638  repl = rb_reg_regsub(repl, str, regs, pat);
3639  }
3640  enc = rb_enc_compatible(str, repl);
3641  if (!enc) {
3642  rb_encoding *str_enc = STR_ENC_GET(str);
3643  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3644  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
3645  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
3646  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3647  rb_enc_name(str_enc),
3648  rb_enc_name(STR_ENC_GET(repl)));
3649  }
3650  enc = STR_ENC_GET(repl);
3651  }
3652  rb_str_modify(str);
3653  rb_enc_associate(str, enc);
3654  if (OBJ_TAINTED(repl)) tainted = 1;
3655  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3656  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3657  int cr2 = ENC_CODERANGE(repl);
3658  if (cr2 == ENC_CODERANGE_BROKEN ||
3659  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
3660  cr = ENC_CODERANGE_UNKNOWN;
3661  else
3662  cr = cr2;
3663  }
3664  plen = end0 - beg0;
3665  rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
3666  len = RSTRING_LEN(str);
3667  if (rlen > plen) {
3668  RESIZE_CAPA(str, len + rlen - plen);
3669  }
3670  p = RSTRING_PTR(str);
3671  if (rlen != plen) {
3672  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
3673  }
3674  memcpy(p + beg0, rp, rlen);
3675  len += rlen - plen;
3676  STR_SET_LEN(str, len);
3677  RSTRING_PTR(str)[len] = '\0';
3678  ENC_CODERANGE_SET(str, cr);
3679  if (tainted) OBJ_TAINT(str);
3680  if (untrusted) OBJ_UNTRUST(str);
3681 
3682  return str;
3683  }
3684  return Qnil;
3685 }
3686 
3687 
3688 /*
3689  * call-seq:
3690  * str.sub(pattern, replacement) -> new_str
3691  * str.sub(pattern, hash) -> new_str
3692  * str.sub(pattern) {|match| block } -> new_str
3693  *
3694  * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3695  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3696  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3697  * regular expression metacharacters it contains will be interpreted
3698  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3699  * instead of a digit.
3700  *
3701  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3702  * the matched text. It may contain back-references to the pattern's capture
3703  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3704  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3705  * double-quoted string, both back-references must be preceded by an
3706  * additional backslash. However, within <i>replacement</i> the special match
3707  * variables, such as <code>$&</code>, will not refer to the current match.
3708  *
3709  * If the second argument is a <code>Hash</code>, and the matched text is one
3710  * of its keys, the corresponding value is the replacement string.
3711  *
3712  * In the block form, the current match string is passed in as a parameter,
3713  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3714  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3715  * returned by the block will be substituted for the match on each call.
3716  *
3717  * The result inherits any tainting in the original string or any supplied
3718  * replacement string.
3719  *
3720  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
3721  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
3722  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
3723  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
3724  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3725  * #=> "Is /bin/bash your preferred shell?"
3726  */
3727 
3728 static VALUE
 /* NOTE(review): the declarator line is missing from this listing; the body
  * reads argc, argv and str -- confirm against the upstream file. */
3730 {
 /* Work on a duplicate so the receiver is left untouched. The return value
  * of rb_str_sub_bang() is ignored, so the duplicate is returned whether or
  * not a substitution took place. */
3731  str = rb_str_dup(str);
3732  rb_str_sub_bang(argc, argv, str);
3733  return str;
3734 }
3735 
/*
 * Shared implementation behind String#gsub and String#gsub!.
 *
 * argv[0] is the pattern.  With argc == 2, argv[1] is the replacement:
 * either something convertible to a Hash (looked up per match) or a value
 * accepted by StringValue().  With argc == 1 every match is handed to
 * rb_yield() and the stringified result is used as the replacement.
 *
 * bang != 0: the contents of str are replaced via rb_str_shared_replace();
 *            Qnil is returned when the pattern never matches.
 * bang == 0: a new string with the class of str is returned
 *            (rb_str_dup(str) when the pattern never matches).
 *
 * Raises ArgumentError when argc is neither 1 nor 2.
 */
3736 static VALUE
3737 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3738 {
3739  VALUE pat, val, repl, match, dest, hash = Qnil;
3740  struct re_registers *regs;
3741  long beg, n;
3742  long beg0, end0;
3743  long offset, blen, slen, len, last;
3744  int iter = 0; /* set to 1 for the block form (argc == 1) */
3745  char *sp, *cp;
3746  int tainted = 0; /* set once the replacement or any substituted value is tainted */
3747  rb_encoding *str_enc;
3748 
3749  switch (argc) {
3750  case 1:
3751  RETURN_ENUMERATOR(str, argc, argv); /* pattern only: enumerator when no block is given (see the gsub docs) */
3752  iter = 1;
3753  break;
3754  case 2:
3755  repl = argv[1];
3756  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash"); /* a nil result is treated as "not a Hash" below */
3757  if (NIL_P(hash)) {
3758  StringValue(repl);
3759  }
3760  if (OBJ_TAINTED(repl)) tainted = 1;
3761  break;
3762  default:
3763  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3764  }
3765 
3766  pat = get_pat(argv[0], 1);
3767  beg = rb_reg_search(pat, str, 0, 0);
3768  if (beg < 0) {
3769  if (bang) return Qnil; /* no match, no substitution */
3770  return rb_str_dup(str);
3771  }
3772 
3773  offset = 0;
3774  n = 0;
3775  blen = RSTRING_LEN(str) + 30; /* len + margin */
3776  dest = rb_str_buf_new(blen);
3777  sp = RSTRING_PTR(str); /* sp/slen: pointer and length of str before any callback runs; passed to str_mod_check() */
3778  slen = RSTRING_LEN(str);
3779  cp = sp; /* cp: start of the not-yet-copied part of str */
3780  str_enc = STR_ENC_GET(str);
3781  rb_enc_associate(dest, str_enc);
3783 
3784  do {
3785  n++; /* match counter; not read anywhere else in this function */
3786  match = rb_backref_get();
3787  regs = RMATCH_REGS(match);
3788  beg0 = BEG(0);
3789  end0 = END(0);
3790  if (iter || !NIL_P(hash)) {
3791  if (iter) {
3792  val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3793  }
3794  else {
3795  val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3796  val = rb_obj_as_string(val);
3797  }
3798  str_mod_check(str, sp, slen); /* NOTE(review): presumably raises if the callback changed str -- confirm */
3799  if (val == dest) { /* paranoid check [ruby-dev:24827] */
3800  rb_raise(rb_eRuntimeError, "block should not cheat");
3801  }
3802  }
3803  else {
3804  val = rb_reg_regsub(repl, str, regs, pat); /* string replacement: built from repl and the match registers */
3805  }
3806 
3807  if (OBJ_TAINTED(val)) tainted = 1;
3808 
3809  len = beg - offset; /* copy pre-match substr */
3810  if (len) {
3811  rb_enc_str_buf_cat(dest, cp, len, str_enc);
3812  }
3813 
3814  rb_str_buf_append(dest, val);
3815 
3816  last = offset; /* offset at which the search producing this match was started */
3817  offset = end0;
3818  if (beg0 == end0) {
3819  /*
3820  * Always consume at least one character of the input string
3821  * in order to prevent infinite loops.
3822  */
3823  if (RSTRING_LEN(str) <= end0) break;
3824  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3825  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3826  offset = end0 + len;
3827  }
3828  cp = RSTRING_PTR(str) + offset; /* the next copy starts right after the consumed text */
3829  if (offset > RSTRING_LEN(str)) break;
3830  beg = rb_reg_search(pat, str, offset, 0);
3831  } while (beg >= 0);
3832  if (RSTRING_LEN(str) > offset) {
3833  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); /* copy the text after the last match */
3834  }
3835  rb_reg_search(pat, str, last, 0); /* repeats the last successful search; NOTE(review): presumably to leave the backref at the final match -- confirm */
3836  if (bang) {
3837  rb_str_shared_replace(str, dest);
3838  }
3839  else {
3840  RBASIC(dest)->klass = rb_obj_class(str); /* the result takes the class of the receiver */
3841  OBJ_INFECT(dest, str);
3842  str = dest;
3843  }
3844 
3845  if (tainted) OBJ_TAINT(str);
3846  return str;
3847 }
3848 
3849 
3850 /*
3851  * call-seq:
3852  * str.gsub!(pattern, replacement) -> str or nil
3853  * str.gsub!(pattern) {|match| block } -> str or nil
3854  * str.gsub!(pattern) -> an_enumerator
3855  *
3856  * Performs the substitutions of <code>String#gsub</code> in place, returning
3857  * <i>str</i>, or <code>nil</code> if no substitutions were performed.
3858  * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
3859  */
3860 
3861 static VALUE
3863 {
3864  str_modify_keep_cr(str);
3865  return str_gsub(argc, argv, str, 1);
3866 }
3867 
3868 
3869 /*
3870  * call-seq:
3871  * str.gsub(pattern, replacement) -> new_str
3872  * str.gsub(pattern, hash) -> new_str
3873  * str.gsub(pattern) {|match| block } -> new_str
3874  * str.gsub(pattern) -> enumerator
3875  *
3876  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
3877  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3878  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3879  * regular expression metacharacters it contains will be interpreted
3880  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3881  * instead of a digit.
3882  *
3883  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3884  * the matched text. It may contain back-references to the pattern's capture
3885  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3886  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3887  * double-quoted string, both back-references must be preceded by an
3888  * additional backslash. However, within <i>replacement</i> the special match
3889  * variables, such as <code>$&</code>, will not refer to the current match.
3890  *
3891  * If the second argument is a <code>Hash</code>, and the matched text is one
3892  * of its keys, the corresponding value is the replacement string.
3893  *
3894  * In the block form, the current match string is passed in as a parameter,
3895  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3896  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3897  * returned by the block will be substituted for the match on each call.
3898  *
3899  * The result inherits any tainting in the original string or any supplied
3900  * replacement string.
3901  *
3902  * When neither a block nor a second argument is supplied, an
3903  * <code>Enumerator</code> is returned.
3904  *
3905  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
3906  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
3907  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
3908  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
3909  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
3910  */
3911 
3912 static VALUE
3914 {
3915  return str_gsub(argc, argv, str, 0);
3916 }
3917 
3918 
3919 /*
3920  * call-seq:
3921  * str.replace(other_str) -> str
3922  *
3923  * Replaces the contents and taintedness of <i>str</i> with the corresponding
3924  * values in <i>other_str</i>.
3925  *
3926  * s = "hello" #=> "hello"
3927  * s.replace "world" #=> "world"
3928  */
3929 
3930 VALUE
3932 {
3933  str_modifiable(str);
3934  if (str == str2) return str;
3935 
3936  StringValue(str2);
3937  str_discard(str);
3938  return str_replace(str, str2);
3939 }
3940 
3941 /*
3942  * call-seq:
3943  * string.clear -> string
3944  *
3945  * Makes string empty.
3946  *
3947  * a = "abcde"
3948  * a.clear #=> ""
3949  */
3950 
3951 static VALUE
3953 {
3954  str_discard(str);
3955  STR_SET_EMBED(str);
3956  STR_SET_EMBED_LEN(str, 0);
3957  RSTRING_PTR(str)[0] = 0;
3958  if (rb_enc_asciicompat(STR_ENC_GET(str)))
3960  else
3962  return str;
3963 }
3964 
3965 /*
3966  * call-seq:
3967  * string.chr -> string
3968  *
3969  * Returns a one-character string at the beginning of the string.
3970  *
3971  * a = "abcde"
3972  * a.chr #=> "a"
3973  */
3974 
3975 static VALUE
3977 {
3978  return rb_str_substr(str, 0, 1);
3979 }
3980 
3981 /*
3982  * call-seq:
3983  * str.getbyte(index) -> 0 .. 255
3984  *
3985  * Returns the <i>index</i>th byte as an integer.
3986  */
3987 static VALUE
3989 {
3990  long pos = NUM2LONG(index);
3991 
3992  if (pos < 0)
3993  pos += RSTRING_LEN(str);
3994  if (pos < 0 || RSTRING_LEN(str) <= pos)
3995  return Qnil;
3996 
3997  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3998 }
3999 
4000 /*
4001  * call-seq:
4002  * str.setbyte(index, int) -> int
4003  *
4004  * Sets the <i>index</i>th byte to <i>int</i>.
4005  */
4006 static VALUE
4007 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4008 {
4009  long pos = NUM2LONG(index);
4010  int byte = NUM2INT(value);
4011 
4012  rb_str_modify(str);
4013 
4014  if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4015  rb_raise(rb_eIndexError, "index %ld out of string", pos);
4016  if (pos < 0)
4017  pos += RSTRING_LEN(str);
4018 
4019  RSTRING_PTR(str)[pos] = byte;
4020 
4021  return value;
4022 }
4023 
4024 static VALUE
4025 str_byte_substr(VALUE str, long beg, long len)
4026 {
4027  char *p, *s = RSTRING_PTR(str);
4028  long n = RSTRING_LEN(str);
4029  VALUE str2;
4030 
4031  if (beg > n || len < 0) return Qnil;
4032  if (beg < 0) {
4033  beg += n;
4034  if (beg < 0) return Qnil;
4035  }
4036  if (beg + len > n)
4037  len = n - beg;
4038  if (len <= 0) {
4039  len = 0;
4040  p = 0;
4041  }
4042  else
4043  p = s + beg;
4044 
4045  if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
4046  str2 = rb_str_new4(str);
4047  str2 = str_new3(rb_obj_class(str2), str2);
4048  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4049  RSTRING(str2)->as.heap.len = len;
4050  }
4051  else {
4052  str2 = rb_str_new5(str, p, len);
4053  }
4054 
4055  str_enc_copy(str2, str);
4056 
4057  if (RSTRING_LEN(str2) == 0) {
4058  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4060  else
4062  }
4063  else {
4064  switch (ENC_CODERANGE(str)) {
4065  case ENC_CODERANGE_7BIT:
4067  break;
4068  default:
4070  break;
4071  }
4072  }
4073 
4074  OBJ_INFECT(str2, str);
4075 
4076  return str2;
4077 }
4078 
4079 static VALUE
4081 {
4082  long idx;
4083  switch (TYPE(indx)) {
4084  case T_FIXNUM:
4085  idx = FIX2LONG(indx);
4086 
4087  num_index:
4088  str = str_byte_substr(str, idx, 1);
4089  if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4090  return str;
4091 
4092  default:
4093  /* check if indx is Range */
4094  {
4095  long beg, len = RSTRING_LEN(str);
4096 
4097  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4098  case Qfalse:
4099  break;
4100  case Qnil:
4101  return Qnil;
4102  default:
4103  return str_byte_substr(str, beg, len);
4104  }
4105  }
4106  idx = NUM2LONG(indx);
4107  goto num_index;
4108  }
4109  return Qnil; /* not reached */
4110 }
4111 
4112 /*
4113  * call-seq:
4114  * str.byteslice(fixnum) -> new_str or nil
4115  * str.byteslice(fixnum, fixnum) -> new_str or nil
4116  * str.byteslice(range) -> new_str or nil
4117  *
4118  * Byte Reference---If passed a single <code>Fixnum</code>, returns a
4119  * substring of one byte at that position. If passed two <code>Fixnum</code>
4120  * objects, returns a substring starting at the offset given by the first, and
4121  * a length given by the second. If given a <code>Range</code>, a substring containing
4122  * bytes at offsets given by the range is returned. In all three cases, if
4123  * an offset is negative, it is counted from the end of <i>str</i>. Returns
4124  * <code>nil</code> if the initial offset falls outside the string, the length
4125  * is negative, or the beginning of the range is greater than the end.
4126  * The resulting string keeps the encoding of the original string.
4127  *
4128  * "hello".byteslice(1) #=> "e"
4129  * "hello".byteslice(-1) #=> "o"
4130  * "hello".byteslice(1, 2) #=> "el"
4131  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4132  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4133  */
4134 
4135 static VALUE
4137 {
4138  if (argc == 2) {
4139  return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4140  }
4141  if (argc != 1) {
4142  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
4143  }
4144  return str_byte_aref(str, argv[0]);
4145 }
4146 
4147 /*
4148  * call-seq:
4149  * str.reverse -> new_str
4150  *
4151  * Returns a new string with the characters from <i>str</i> in reverse order.
4152  *
4153  * "stressed".reverse #=> "desserts"
4154  */
4155 
4156 static VALUE
4158 {
4159  rb_encoding *enc;
4160  VALUE rev;
4161  char *s, *e, *p;
4162  int single = 1;
4163 
4164  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4165  enc = STR_ENC_GET(str);
4166  rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4167  s = RSTRING_PTR(str); e = RSTRING_END(str);
4168  p = RSTRING_END(rev);
4169 
4170  if (RSTRING_LEN(str) > 1) {
4171  if (single_byte_optimizable(str)) {
4172  while (s < e) {
4173  *--p = *s++;
4174  }
4175  }
4176  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4177  while (s < e) {
4178  int clen = rb_enc_fast_mbclen(s, e, enc);
4179 
4180  if (clen > 1 || (*s & 0x80)) single = 0;
4181  p -= clen;
4182  memcpy(p, s, clen);
4183  s += clen;
4184  }
4185  }
4186  else {
4187  while (s < e) {
4188  int clen = rb_enc_mbclen(s, e, enc);
4189 
4190  if (clen > 1 || (*s & 0x80)) single = 0;
4191  p -= clen;
4192  memcpy(p, s, clen);
4193  s += clen;
4194  }
4195  }
4196  }
4197  STR_SET_LEN(rev, RSTRING_LEN(str));
4198  OBJ_INFECT(rev, str);
4199  if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4200  if (single) {
4202  }
4203  else {
4205  }
4206  }
4208 
4209  return rev;
4210 }
4211 
4212 
4213 /*
4214  * call-seq:
4215  * str.reverse! -> str
4216  *
4217  * Reverses <i>str</i> in place.
4218  */
4219 
4220 static VALUE
4222 {
4223  if (RSTRING_LEN(str) > 1) {
4224  if (single_byte_optimizable(str)) {
4225  char *s, *e, c;
4226 
4227  str_modify_keep_cr(str);
4228  s = RSTRING_PTR(str);
4229  e = RSTRING_END(str) - 1;
4230  while (s < e) {
4231  c = *s;
4232  *s++ = *e;
4233  *e-- = c;
4234  }
4235  }
4236  else {
4238  }
4239  }
4240  else {
4241  str_modify_keep_cr(str);
4242  }
4243  return str;
4244 }
4245 
4246 
4247 /*
4248  * call-seq:
4249  * str.include? other_str -> true or false
4250  *
4251  * Returns <code>true</code> if <i>str</i> contains the given string or
4252  * character.
4253  *
4254  * "hello".include? "lo" #=> true
4255  * "hello".include? "ol" #=> false
4256  * "hello".include? ?h #=> true
4257  */
4258 
4259 static VALUE
4261 {
4262  long i;
4263 
4264  StringValue(arg);
4265  i = rb_str_index(str, arg, 0);
4266 
4267  if (i == -1) return Qfalse;
4268  return Qtrue;
4269 }
4270 
4271 
4272 /*
4273  * call-seq:
4274  * str.to_i(base=10) -> integer
4275  *
4276  * Returns the result of interpreting leading characters in <i>str</i> as an
4277  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4278  * end of a valid number are ignored. If there is not a valid number at the
4279  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
4280  * exception when <i>base</i> is valid.
4281  *
4282  * "12345".to_i #=> 12345
4283  * "99 red balloons".to_i #=> 99
4284  * "0a".to_i #=> 0
4285  * "0a".to_i(16) #=> 10
4286  * "hello".to_i #=> 0
4287  * "1100101".to_i(2) #=> 101
4288  * "1100101".to_i(8) #=> 294977
4289  * "1100101".to_i(10) #=> 1100101
4290  * "1100101".to_i(16) #=> 17826049
4291  */
4292 
4293 static VALUE
4295 {
4296  int base;
4297 
4298  if (argc == 0) base = 10;
4299  else {
4300  VALUE b;
4301 
4302  rb_scan_args(argc, argv, "01", &b);
4303  base = NUM2INT(b);
4304  }
4305  if (base < 0) {
4306  rb_raise(rb_eArgError, "invalid radix %d", base);
4307  }
4308  return rb_str_to_inum(str, base, FALSE);
4309 }
4310 
4311 
4312 /*
4313  * call-seq:
4314  * str.to_f -> float
4315  *
4316  * Returns the result of interpreting leading characters in <i>str</i> as a
4317  * floating point number. Extraneous characters past the end of a valid number
4318  * are ignored. If there is not a valid number at the start of <i>str</i>,
4319  * <code>0.0</code> is returned. This method never raises an exception.
4320  *
4321  * "123.45e1".to_f #=> 1234.5
4322  * "45.67 degrees".to_f #=> 45.67
4323  * "thx1138".to_f #=> 0.0
4324  */
4325 
4326 static VALUE
4328 {
4329  return DBL2NUM(rb_str_to_dbl(str, FALSE));
4330 }
4331 
4332 
4333 /*
4334  * call-seq:
4335  * str.to_s -> str
4336  * str.to_str -> str
4337  *
4338  * Returns the receiver.
4339  */
4340 
4341 static VALUE
4343 {
4344  if (rb_obj_class(str) != rb_cString) {
4345  return str_duplicate(rb_cString, str);
4346  }
4347  return str;
4348 }
4349 
4350 #if 0 /* disabled helper: appends code point c, encoded with enc, to str */
4351 static void
4352 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
4353 {
4354  char s[RUBY_MAX_CHAR_LEN]; /* scratch buffer for one encoded character */
4355  int n = rb_enc_codelen(c, enc); /* NOTE(review): presumably the byte length of c in enc -- confirm */
4356 
4357  rb_enc_mbcput(c, s, enc); /* encode c into s */
4358  rb_enc_str_buf_cat(str, s, n, enc); /* append those n bytes to str */
4359 }
4360 #endif
4361 
4362 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4363 
4364 int
4365 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4366 {
4367  char buf[CHAR_ESC_LEN + 1];
4368  int l;
4369 
4370 #if SIZEOF_INT > 4
4371  c &= 0xffffffff;
4372 #endif
4373  if (unicode_p) {
4374  if (c < 0x7F && ISPRINT(c)) {
4375  snprintf(buf, CHAR_ESC_LEN, "%c", c);
4376  }
4377  else if (c < 0x10000) {
4378  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4379  }
4380  else {
4381  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4382  }
4383  }
4384  else {
4385  if (c < 0x100) {
4386  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4387  }
4388  else {
4389  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4390  }
4391  }
4392  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
4393  rb_str_buf_cat(result, buf, l);
4394  return l;
4395 }
4396 
4397 /*
4398  * call-seq:
4399  * str.inspect -> string
4400  *
4401  * Returns a printable version of _str_, surrounded by quote marks,
4402  * with special characters escaped.
4403  *
4404  * str = "hello"
4405  * str[3] = "\b"
4406  * str.inspect #=> "\"hel\\bo\""
4407  */
4408 
4409 VALUE
4411 {
4412  rb_encoding *enc = STR_ENC_GET(str);
4413  const char *p, *pend, *prev;
4414  char buf[CHAR_ESC_LEN + 1];
4417  int unicode_p = rb_enc_unicode_p(enc);
4418  int asciicompat = rb_enc_asciicompat(enc);
4419  static rb_encoding *utf16, *utf32;
4420 
4421  if (!utf16) utf16 = rb_enc_find("UTF-16");
4422  if (!utf32) utf32 = rb_enc_find("UTF-32");
4423  if (resenc == NULL) resenc = rb_default_external_encoding();
4424  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4425  rb_enc_associate(result, resenc);
4426  str_buf_cat2(result, "\"");
4427 
4428  p = RSTRING_PTR(str); pend = RSTRING_END(str);
4429  prev = p;
4430  if (enc == utf16) {
4431  const unsigned char *q = (const unsigned char *)p;
4432  if (q[0] == 0xFE && q[1] == 0xFF)
4433  enc = rb_enc_find("UTF-16BE");
4434  else if (q[0] == 0xFF && q[1] == 0xFE)
4435  enc = rb_enc_find("UTF-16LE");
4436  else
4437  unicode_p = 0;
4438  }
4439  else if (enc == utf32) {
4440  const unsigned char *q = (const unsigned char *)p;
4441  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4442  enc = rb_enc_find("UTF-32BE");
4443  else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4444  enc = rb_enc_find("UTF-32LE");
4445  else
4446  unicode_p = 0;
4447  }
4448  while (p < pend) {
4449  unsigned int c, cc;
4450  int n;
4451 
4452  n = rb_enc_precise_mbclen(p, pend, enc);
4453  if (!MBCLEN_CHARFOUND_P(n)) {
4454  if (p > prev) str_buf_cat(result, prev, p - prev);
4455  n = rb_enc_mbminlen(enc);
4456  if (pend < p + n)
4457  n = (int)(pend - p);
4458  while (n--) {
4459  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4460  str_buf_cat(result, buf, strlen(buf));
4461  prev = ++p;
4462  }
4463  continue;
4464  }
4465  n = MBCLEN_CHARFOUND_LEN(n);
4466  c = rb_enc_mbc_to_codepoint(p, pend, enc);
4467  p += n;
4468  if ((asciicompat || unicode_p) &&
4469  (c == '"'|| c == '\\' ||
4470  (c == '#' &&
4471  p < pend &&
4473  (cc = rb_enc_codepoint(p,pend,enc),
4474  (cc == '$' || cc == '@' || cc == '{'))))) {
4475  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4476  str_buf_cat2(result, "\\");
4477  if (asciicompat || enc == resenc) {
4478  prev = p - n;
4479  continue;
4480  }
4481  }
4482  switch (c) {
4483  case '\n': cc = 'n'; break;
4484  case '\r': cc = 'r'; break;
4485  case '\t': cc = 't'; break;
4486  case '\f': cc = 'f'; break;
4487  case '\013': cc = 'v'; break;
4488  case '\010': cc = 'b'; break;
4489  case '\007': cc = 'a'; break;
4490  case 033: cc = 'e'; break;
4491  default: cc = 0; break;
4492  }
4493  if (cc) {
4494  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4495  buf[0] = '\\';
4496  buf[1] = (char)cc;
4497  str_buf_cat(result, buf, 2);
4498  prev = p;
4499  continue;
4500  }
4501  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4502  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4503  continue;
4504  }
4505  else {
4506  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4507  rb_str_buf_cat_escaped_char(result, c, unicode_p);
4508  prev = p;
4509  continue;
4510  }
4511  }
4512  if (p > prev) str_buf_cat(result, prev, p - prev);
4513  str_buf_cat2(result, "\"");
4514 
4515  OBJ_INFECT(result, str);
4516  return result;
4517 }
4518 
4519 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) /* true when p is before e and points at '$', '@' or '{' */
4520 
4521 /*
4522  * call-seq:
4523  * str.dump -> new_str
4524  *
4525  * Produces a version of <i>str</i> with all nonprinting characters replaced by
4526  * <code>\xNN</code> notation (<code>\u{...}</code> for multibyte characters of UTF-8 strings) and all special characters escaped.
4527  */
4528 
4529 VALUE
4531 {
4532  rb_encoding *enc = rb_enc_get(str);
4533  long len;
4534  const char *p, *pend;
4535  char *q, *qend;
4536  VALUE result;
4537  int u8 = (enc == rb_utf8_encoding());
4538 
4539  len = 2; /* "" */
4540  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4541  while (p < pend) {
4542  unsigned char c = *p++;
4543  switch (c) {
4544  case '"': case '\\':
4545  case '\n': case '\r':
4546  case '\t': case '\f':
4547  case '\013': case '\010': case '\007': case '\033':
4548  len += 2;
4549  break;
4550 
4551  case '#':
4552  len += IS_EVSTR(p, pend) ? 2 : 1;
4553  break;
4554 
4555  default:
4556  if (ISPRINT(c)) {
4557  len++;
4558  }
4559  else {
4560  if (u8) { /* \u{NN} */
4561  int n = rb_enc_precise_mbclen(p-1, pend, enc);
4562  if (MBCLEN_CHARFOUND_P(n-1)) {
4563  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4564  while (cc >>= 4) len++;
4565  len += 5;
4566  p += MBCLEN_CHARFOUND_LEN(n)-1;
4567  break;
4568  }
4569  }
4570  len += 4; /* \xNN */
4571  }
4572  break;
4573  }
4574  }
4575  if (!rb_enc_asciicompat(enc)) {
4576  len += 19; /* ".force_encoding('')" */
4577  len += strlen(enc->name);
4578  }
4579 
4580  result = rb_str_new5(str, 0, len);
4581  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4582  q = RSTRING_PTR(result); qend = q + len + 1;
4583 
4584  *q++ = '"';
4585  while (p < pend) {
4586  unsigned char c = *p++;
4587 
4588  if (c == '"' || c == '\\') {
4589  *q++ = '\\';
4590  *q++ = c;
4591  }
4592  else if (c == '#') {
4593  if (IS_EVSTR(p, pend)) *q++ = '\\';
4594  *q++ = '#';
4595  }
4596  else if (c == '\n') {
4597  *q++ = '\\';
4598  *q++ = 'n';
4599  }
4600  else if (c == '\r') {
4601  *q++ = '\\';
4602  *q++ = 'r';
4603  }
4604  else if (c == '\t') {
4605  *q++ = '\\';
4606  *q++ = 't';
4607  }
4608  else if (c == '\f') {
4609  *q++ = '\\';
4610  *q++ = 'f';
4611  }
4612  else if (c == '\013') {
4613  *q++ = '\\';
4614  *q++ = 'v';
4615  }
4616  else if (c == '\010') {
4617  *q++ = '\\';
4618  *q++ = 'b';
4619  }
4620  else if (c == '\007') {
4621  *q++ = '\\';
4622  *q++ = 'a';
4623  }
4624  else if (c == '\033') {
4625  *q++ = '\\';
4626  *q++ = 'e';
4627  }
4628  else if (ISPRINT(c)) {
4629  *q++ = c;
4630  }
4631  else {
4632  *q++ = '\\';
4633  if (u8) {
4634  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4635  if (MBCLEN_CHARFOUND_P(n)) {
4636  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4637  p += n;
4638  snprintf(q, qend-q, "u{%x}", cc);
4639  q += strlen(q);
4640  continue;
4641  }
4642  }
4643  snprintf(q, qend-q, "x%02X", c);
4644  q += 3;
4645  }
4646  }
4647  *q++ = '"';
4648  *q = '\0';
4649  if (!rb_enc_asciicompat(enc)) {
4650  snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4651  enc = rb_ascii8bit_encoding();
4652  }
4653  OBJ_INFECT(result, str);
4654  /* result from dump is ASCII */
4655  rb_enc_associate(result, enc);
4657  return result;
4658 }
4659 
4660 
4661 static void
4663 {
4664  if (rb_enc_dummy_p(enc)) {
4665  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4666  rb_enc_name(enc));
4667  }
4668 }
4669 
4670 /*
4671  * call-seq:
4672  * str.upcase! -> str or nil
4673  *
4674  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4675  * were made.
4676  * Note: case replacement is effective only in ASCII region.
4677  */
4678 
4679 static VALUE
4681 {
4682  rb_encoding *enc;
4683  char *s, *send;
4684  int modify = 0;
4685  int n;
4686 
4687  str_modify_keep_cr(str);
4688  enc = STR_ENC_GET(str);
4690  s = RSTRING_PTR(str); send = RSTRING_END(str);
4691  if (single_byte_optimizable(str)) {
4692  while (s < send) {
4693  unsigned int c = *(unsigned char*)s;
4694 
4695  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4696  *s = 'A' + (c - 'a');
4697  modify = 1;
4698  }
4699  s++;
4700  }
4701  }
4702  else {
4703  int ascompat = rb_enc_asciicompat(enc);
4704 
4705  while (s < send) {
4706  unsigned int c;
4707 
4708  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4709  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4710  *s = 'A' + (c - 'a');
4711  modify = 1;
4712  }
4713  s++;
4714  }
4715  else {
4716  c = rb_enc_codepoint_len(s, send, &n, enc);
4717  if (rb_enc_islower(c, enc)) {
4718  /* assuming toupper returns codepoint with same size */
4719  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4720  modify = 1;
4721  }
4722  s += n;
4723  }
4724  }
4725  }
4726 
4727  if (modify) return str;
4728  return Qnil;
4729 }
4730 
4731 
4732 /*
4733  * call-seq:
4734  * str.upcase -> new_str
4735  *
4736  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
4737  * uppercase counterparts. The operation is locale insensitive---only
4738  * characters ``a'' to ``z'' are affected.
4739  * Note: case replacement is effective only in ASCII region.
4740  *
4741  * "hEllO".upcase #=> "HELLO"
4742  */
4743 
4744 static VALUE
4746 {
4747  str = rb_str_dup(str);
4748  rb_str_upcase_bang(str);
4749  return str;
4750 }
4751 
4752 
4753 /*
4754  * call-seq:
4755  * str.downcase! -> str or nil
4756  *
4757  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4758  * changes were made.
4759  * Note: case replacement is effective only in ASCII region.
4760  */
4761 
4762 static VALUE
4764 {
4765  rb_encoding *enc;
4766  char *s, *send;
4767  int modify = 0;
4768 
4769  str_modify_keep_cr(str);
4770  enc = STR_ENC_GET(str);
4772  s = RSTRING_PTR(str); send = RSTRING_END(str);
4773  if (single_byte_optimizable(str)) {
4774  while (s < send) {
4775  unsigned int c = *(unsigned char*)s;
4776 
4777  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4778  *s = 'a' + (c - 'A');
4779  modify = 1;
4780  }
4781  s++;
4782  }
4783  }
4784  else {
4785  int ascompat = rb_enc_asciicompat(enc);
4786 
4787  while (s < send) {
4788  unsigned int c;
4789  int n;
4790 
4791  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4792  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4793  *s = 'a' + (c - 'A');
4794  modify = 1;
4795  }
4796  s++;
4797  }
4798  else {
4799  c = rb_enc_codepoint_len(s, send, &n, enc);
4800  if (rb_enc_isupper(c, enc)) {
4801  /* assuming tolower returns codepoint with same size */
4802  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4803  modify = 1;
4804  }
4805  s += n;
4806  }
4807  }
4808  }
4809 
4810  if (modify) return str;
4811  return Qnil;
4812 }
4813 
4814 
4815 /*
4816  * call-seq:
4817  * str.downcase -> new_str
4818  *
4819  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
4820  * lowercase counterparts. The operation is locale insensitive---only
4821  * characters ``A'' to ``Z'' are affected.
4822  * Note: case replacement is effective only in ASCII region.
4823  *
4824  * "hEllO".downcase #=> "hello"
4825  */
4826 
4827 static VALUE
4829 {
4830  str = rb_str_dup(str);
4831  rb_str_downcase_bang(str);
4832  return str;
4833 }
4834 
4835 
4836 /*
4837  * call-seq:
4838  * str.capitalize! -> str or nil
4839  *
4840  * Modifies <i>str</i> by converting the first character to uppercase and the
4841  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
4842  * Note: case conversion is effective only in ASCII region.
4843  *
4844  * a = "hello"
4845  * a.capitalize! #=> "Hello"
4846  * a #=> "Hello"
4847  * a.capitalize! #=> nil
4848  */
4849 
4850 static VALUE
4852 {
4853  rb_encoding *enc;
4854  char *s, *send;
4855  int modify = 0;
4856  unsigned int c;
4857  int n;
4858 
4859  str_modify_keep_cr(str);
4860  enc = STR_ENC_GET(str);
4862  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4863  s = RSTRING_PTR(str); send = RSTRING_END(str);
4864 
4865  c = rb_enc_codepoint_len(s, send, &n, enc);
4866  if (rb_enc_islower(c, enc)) {
4867  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4868  modify = 1;
4869  }
4870  s += n;
4871  while (s < send) {
4872  c = rb_enc_codepoint_len(s, send, &n, enc);
4873  if (rb_enc_isupper(c, enc)) {
4874  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4875  modify = 1;
4876  }
4877  s += n;
4878  }
4879 
4880  if (modify) return str;
4881  return Qnil;
4882 }
4883 
4884 
4885 /*
4886  * call-seq:
4887  * str.capitalize -> new_str
4888  *
4889  * Returns a copy of <i>str</i> with the first character converted to uppercase
4890  * and the remainder to lowercase.
4891  * Note: case conversion is effective only in ASCII region.
4892  *
4893  * "hello".capitalize #=> "Hello"
4894  * "HELLO".capitalize #=> "Hello"
4895  * "123ABC".capitalize #=> "123abc"
4896  */
4897 
4898 static VALUE
4900 {
4901  str = rb_str_dup(str);
4903  return str;
4904 }
4905 
4906 
4907 /*
4908  * call-seq:
4909  * str.swapcase! -> str or nil
4910  *
4911  * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4912  * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4913  * Note: case conversion is effective only in ASCII region.
4914  */
4915 
4916 static VALUE
4918 {
4919  rb_encoding *enc;
4920  char *s, *send;
4921  int modify = 0;
4922  int n;
4923 
4924  str_modify_keep_cr(str);
4925  enc = STR_ENC_GET(str);
4927  s = RSTRING_PTR(str); send = RSTRING_END(str);
4928  while (s < send) {
4929  unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
4930 
4931  if (rb_enc_isupper(c, enc)) {
4932  /* assuming tolower returns codepoint with same size */
4933  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4934  modify = 1;
4935  }
4936  else if (rb_enc_islower(c, enc)) {
4937  /* assuming toupper returns codepoint with same size */
4938  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4939  modify = 1;
4940  }
4941  s += n;
4942  }
4943 
4944  if (modify) return str;
4945  return Qnil;
4946 }
4947 
4948 
4949 /*
4950  * call-seq:
4951  * str.swapcase -> new_str
4952  *
4953  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4954  * to lowercase and lowercase characters converted to uppercase.
4955  * Note: case conversion is effective only in ASCII region.
4956  *
4957  * "Hello".swapcase #=> "hELLO"
4958  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
4959  */
4960 
4961 static VALUE
4963 {
4964  str = rb_str_dup(str);
4965  rb_str_swapcase_bang(str);
4966  return str;
4967 }
4968 
4969 typedef unsigned char *USTR; /* pointer to unsigned bytes */
4970 
4971 struct tr { /* iteration state that trnext() keeps while walking a character spec */
4972  int gen; /* non-zero while trnext() is stepping through a "from-to" range */
4973  unsigned int now, max; /* now: code point produced last; max: upper end of the current range */
4974  char *p, *pend; /* read cursor into the spec and its end */
4975 };
4976 
4977 static unsigned int
4978 trnext(struct tr *t, rb_encoding *enc)
4979 {
4980  int n;
4981 
4982  for (;;) {
4983  if (!t->gen) {
4984  if (t->p == t->pend) return -1;
4985  if (t->p < t->pend - 1 && *t->p == '\\') {
4986  t->p++;
4987  }
4988  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4989  t->p += n;
4990  if (t->p < t->pend - 1 && *t->p == '-') {
4991  t->p++;
4992  if (t->p < t->pend) {
4993  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4994  t->p += n;
4995  if (t->now > c) {
4996  if (t->now < 0x80 && c < 0x80) {
4998  "invalid range \"%c-%c\" in string transliteration",
4999  t->now, c);
5000  }
5001  else {
5002  rb_raise(rb_eArgError, "invalid range in string transliteration");
5003  }
5004  continue; /* not reached */
5005  }
5006  t->gen = 1;
5007  t->max = c;
5008  }
5009  }
5010  return t->now;
5011  }
5012  else if (++t->now < t->max) {
5013  return t->now;
5014  }
5015  else {
5016  t->gen = 0;
5017  return t->max;
5018  }
5019  }
5020 }
5021 
5022 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5023 
5024 static VALUE
5025 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5026 {
5027  const unsigned int errc = -1;
5028  unsigned int trans[256];
5029  rb_encoding *enc, *e1, *e2;
5030  struct tr trsrc, trrepl;
5031  int cflag = 0;
5032  unsigned int c, c0, last = 0;
5033  int modify = 0, i, l;
5034  char *s, *send;
5035  VALUE hash = 0;
5036  int singlebyte = single_byte_optimizable(str);
5037  int cr;
5038 
5039 #define CHECK_IF_ASCII(c) \
5040  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5041  (cr = ENC_CODERANGE_VALID) : 0)
5042 
5043  StringValue(src);
5044  StringValue(repl);
5045  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5046  if (RSTRING_LEN(repl) == 0) {
5047  return rb_str_delete_bang(1, &src, str);
5048  }
5049 
5050  cr = ENC_CODERANGE(str);
5051  e1 = rb_enc_check(str, src);
5052  e2 = rb_enc_check(str, repl);
5053  if (e1 == e2) {
5054  enc = e1;
5055  }
5056  else {
5057  enc = rb_enc_check(src, repl);
5058  }
5059  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5060  if (RSTRING_LEN(src) > 1 &&
5061  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5062  trsrc.p + l < trsrc.pend) {
5063  cflag = 1;
5064  trsrc.p += l;
5065  }
5066  trrepl.p = RSTRING_PTR(repl);
5067  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5068  trsrc.gen = trrepl.gen = 0;
5069  trsrc.now = trrepl.now = 0;
5070  trsrc.max = trrepl.max = 0;
5071 
5072  if (cflag) {
5073  for (i=0; i<256; i++) {
5074  trans[i] = 1;
5075  }
5076  while ((c = trnext(&trsrc, enc)) != errc) {
5077  if (c < 256) {
5078  trans[c] = errc;
5079  }
5080  else {
5081  if (!hash) hash = rb_hash_new();
5082  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5083  }
5084  }
5085  while ((c = trnext(&trrepl, enc)) != errc)
5086  /* retrieve last replacer */;
5087  last = trrepl.now;
5088  for (i=0; i<256; i++) {
5089  if (trans[i] != errc) {
5090  trans[i] = last;
5091  }
5092  }
5093  }
5094  else {
5095  unsigned int r;
5096 
5097  for (i=0; i<256; i++) {
5098  trans[i] = errc;
5099  }
5100  while ((c = trnext(&trsrc, enc)) != errc) {
5101  r = trnext(&trrepl, enc);
5102  if (r == errc) r = trrepl.now;
5103  if (c < 256) {
5104  trans[c] = r;
5105  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5106  }
5107  else {
5108  if (!hash) hash = rb_hash_new();
5109  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5110  }
5111  }
5112  }
5113 
5114  if (cr == ENC_CODERANGE_VALID)
5115  cr = ENC_CODERANGE_7BIT;
5116  str_modify_keep_cr(str);
5117  s = RSTRING_PTR(str); send = RSTRING_END(str);
5118  if (sflag) {
5119  int clen, tlen;
5120  long offset, max = RSTRING_LEN(str);
5121  unsigned int save = -1;
5122  char *buf = ALLOC_N(char, max), *t = buf;
5123 
5124  while (s < send) {
5125  int may_modify = 0;
5126 
5127  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5128  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5129 
5130  s += clen;
5131  if (c < 256) {
5132  c = trans[c];
5133  }
5134  else if (hash) {
5135  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5136  if (NIL_P(tmp)) {
5137  if (cflag) c = last;
5138  else c = errc;
5139  }
5140  else if (cflag) c = errc;
5141  else c = NUM2INT(tmp);
5142  }
5143  else {
5144  c = errc;
5145  }
5146  if (c != (unsigned int)-1) {
5147  if (save == c) {
5148  CHECK_IF_ASCII(c);
5149  continue;
5150  }
5151  save = c;
5152  tlen = rb_enc_codelen(c, enc);
5153  modify = 1;
5154  }
5155  else {
5156  save = -1;
5157  c = c0;
5158  if (enc != e1) may_modify = 1;
5159  }
5160  while (t - buf + tlen >= max) {
5161  offset = t - buf;
5162  max *= 2;
5163  REALLOC_N(buf, char, max);
5164  t = buf + offset;
5165  }
5166  rb_enc_mbcput(c, t, enc);
5167  if (may_modify && memcmp(s, t, tlen) != 0) {
5168  modify = 1;
5169  }
5170  CHECK_IF_ASCII(c);
5171  t += tlen;
5172  }
5173  if (!STR_EMBED_P(str)) {
5174  xfree(RSTRING(str)->as.heap.ptr);
5175  }
5176  *t = '\0';
5177  RSTRING(str)->as.heap.ptr = buf;
5178  RSTRING(str)->as.heap.len = t - buf;
5179  STR_SET_NOEMBED(str);
5180  RSTRING(str)->as.heap.aux.capa = max;
5181  }
5182  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5183  while (s < send) {
5184  c = (unsigned char)*s;
5185  if (trans[c] != errc) {
5186  if (!cflag) {
5187  c = trans[c];
5188  *s = c;
5189  modify = 1;
5190  }
5191  else {
5192  *s = last;
5193  modify = 1;
5194  }
5195  }
5196  CHECK_IF_ASCII(c);
5197  s++;
5198  }
5199  }
5200  else {
5201  int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5202  long offset;
5203  char *buf = ALLOC_N(char, max), *t = buf;
5204 
5205  while (s < send) {
5206  int may_modify = 0;
5207  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5208  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5209 
5210  if (c < 256) {
5211  c = trans[c];
5212  }
5213  else if (hash) {
5214  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5215  if (NIL_P(tmp)) {
5216  if (cflag) c = last;
5217  else c = errc;
5218  }
5219  else if (cflag) c = errc;
5220  else c = NUM2INT(tmp);
5221  }
5222  else {
5223  c = cflag ? last : errc;
5224  }
5225  if (c != errc) {
5226  tlen = rb_enc_codelen(c, enc);
5227  modify = 1;
5228  }
5229  else {
5230  c = c0;
5231  if (enc != e1) may_modify = 1;
5232  }
5233  while (t - buf + tlen >= max) {
5234  offset = t - buf;
5235  max *= 2;
5236  REALLOC_N(buf, char, max);
5237  t = buf + offset;
5238  }
5239  if (s != t) {
5240  rb_enc_mbcput(c, t, enc);
5241  if (may_modify && memcmp(s, t, tlen) != 0) {
5242  modify = 1;
5243  }
5244  }
5245  CHECK_IF_ASCII(c);
5246  s += clen;
5247  t += tlen;
5248  }
5249  if (!STR_EMBED_P(str)) {
5250  xfree(RSTRING(str)->as.heap.ptr);
5251  }
5252  *t = '\0';
5253  RSTRING(str)->as.heap.ptr = buf;
5254  RSTRING(str)->as.heap.len = t - buf;
5255  STR_SET_NOEMBED(str);
5256  RSTRING(str)->as.heap.aux.capa = max;
5257  }
5258 
5259  if (modify) {
5260  if (cr != ENC_CODERANGE_BROKEN)
5261  ENC_CODERANGE_SET(str, cr);
5262  rb_enc_associate(str, enc);
5263  return str;
5264  }
5265  return Qnil;
5266 }
5267 
5268 
5269 /*
5270  * call-seq:
5271  * str.tr!(from_str, to_str) -> str or nil
5272  *
5273  * Translates <i>str</i> in place, using the same rules as
5274  * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5275  * changes were made.
5276  */
5277 
5278 static VALUE
5280 {
5281  return tr_trans(str, src, repl, 0);
5282 }
5283 
5284 
5285 /*
5286  * call-seq:
5287  * str.tr(from_str, to_str) -> new_str
5288  *
5289  * Returns a copy of <i>str</i> with the characters in <i>from_str</i>
5290  * replaced by the corresponding characters in <i>to_str</i>. If
5291  * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
5292  * character in order to maintain the correspondence.
5293  *
5294  * "hello".tr('el', 'ip') #=> "hippo"
5295  * "hello".tr('aeiou', '*') #=> "h*ll*"
5296  *
5297  * Both strings may use the c1-c2 notation to denote ranges of characters,
5298  * and <i>from_str</i> may start with a <code>^</code>, which denotes all
5299  * characters except those listed.
5300  *
5301  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
5302  * "hello".tr('^aeiou', '*') #=> "*e**o"
5303  */
5304 
5305 static VALUE
5306 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5307 {
5308  str = rb_str_dup(str);
5309  tr_trans(str, src, repl, 0);
5310  return str;
5311 }
5312 
5313 #define TR_TABLE_SIZE 257
5314 static void
5315 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5316  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5317 {
5318  const unsigned int errc = -1;
5319  char buf[256];
5320  struct tr tr;
5321  unsigned int c;
5322  VALUE table = 0, ptable = 0;
5323  int i, l, cflag = 0;
5324 
5325  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5326  tr.gen = tr.now = tr.max = 0;
5327 
5328  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5329  cflag = 1;
5330  tr.p += l;
5331  }
5332  if (first) {
5333  for (i=0; i<256; i++) {
5334  stable[i] = 1;
5335  }
5336  stable[256] = cflag;
5337  }
5338  else if (stable[256] && !cflag) {
5339  stable[256] = 0;
5340  }
5341  for (i=0; i<256; i++) {
5342  buf[i] = cflag;
5343  }
5344 
5345  while ((c = trnext(&tr, enc)) != errc) {
5346  if (c < 256) {
5347  buf[c & 0xff] = !cflag;
5348  }
5349  else {
5350  VALUE key = UINT2NUM(c);
5351 
5352  if (!table) {
5353  table = rb_hash_new();
5354  if (cflag) {
5355  ptable = *ctablep;
5356  *ctablep = table;
5357  }
5358  else {
5359  ptable = *tablep;
5360  *tablep = table;
5361  }
5362  }
5363  if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
5364  rb_hash_aset(table, key, Qtrue);
5365  }
5366  }
5367  }
5368  for (i=0; i<256; i++) {
5369  stable[i] = stable[i] && buf[i];
5370  }
5371 }
5372 
5373 
5374 static int
5375 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5376 {
5377  if (c < 256) {
5378  return table[c] != 0;
5379  }
5380  else {
5381  VALUE v = UINT2NUM(c);
5382 
5383  if (del) {
5384  if (!NIL_P(rb_hash_lookup(del, v)) &&
5385  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5386  return TRUE;
5387  }
5388  }
5389  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5390  return FALSE;
5391  }
5392  return table[256] ? TRUE : FALSE;
5393  }
5394 }
5395 
5396 /*
5397  * call-seq:
5398  * str.delete!([other_str]+) -> str or nil
5399  *
5400  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5401  * <code>nil</code> if <i>str</i> was not modified.
5402  */
5403 
5404 static VALUE
5406 {
5407  char squeez[TR_TABLE_SIZE];
5408  rb_encoding *enc = 0;
5409  char *s, *send, *t;
5410  VALUE del = 0, nodel = 0;
5411  int modify = 0;
5412  int i, ascompat, cr;
5413 
5414  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5415  if (argc < 1) {
5416  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5417  }
5418  for (i=0; i<argc; i++) {
5419  VALUE s = argv[i];
5420 
5421  StringValue(s);
5422  enc = rb_enc_check(str, s);
5423  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5424  }
5425 
5426  str_modify_keep_cr(str);
5427  ascompat = rb_enc_asciicompat(enc);
5428  s = t = RSTRING_PTR(str);
5429  send = RSTRING_END(str);
5430  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5431  while (s < send) {
5432  unsigned int c;
5433  int clen;
5434 
5435  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5436  if (squeez[c]) {
5437  modify = 1;
5438  }
5439  else {
5440  if (t != s) *t = c;
5441  t++;
5442  }
5443  s++;
5444  }
5445  else {
5446  c = rb_enc_codepoint_len(s, send, &clen, enc);
5447 
5448  if (tr_find(c, squeez, del, nodel)) {
5449  modify = 1;
5450  }
5451  else {
5452  if (t != s) rb_enc_mbcput(c, t, enc);
5453  t += clen;
5454  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
5455  }
5456  s += clen;
5457  }
5458  }
5459  *t = '\0';
5460  STR_SET_LEN(str, t - RSTRING_PTR(str));
5461  ENC_CODERANGE_SET(str, cr);
5462 
5463  if (modify) return str;
5464  return Qnil;
5465 }
5466 
5467 
5468 /*
5469  * call-seq:
5470  * str.delete([other_str]+) -> new_str
5471  *
5472  * Returns a copy of <i>str</i> with all characters in the intersection of its
5473  * arguments deleted. Uses the same rules for building the set of characters as
5474  * <code>String#count</code>.
5475  *
5476  * "hello".delete "l","lo" #=> "heo"
5477  * "hello".delete "lo" #=> "he"
5478  * "hello".delete "aeiou", "^e" #=> "hell"
5479  * "hello".delete "ej-m" #=> "ho"
5480  */
5481 
5482 static VALUE
5484 {
5485  str = rb_str_dup(str);
5486  rb_str_delete_bang(argc, argv, str);
5487  return str;
5488 }
5489 
5490 
5491 /*
5492  * call-seq:
5493  * str.squeeze!([other_str]*) -> str or nil
5494  *
5495  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
5496  * <code>nil</code> if no changes were made.
5497  */
5498 
5499 static VALUE
5501 {
5502  char squeez[TR_TABLE_SIZE];
5503  rb_encoding *enc = 0;
5504  VALUE del = 0, nodel = 0;
5505  char *s, *send, *t;
5506  int i, modify = 0;
5507  int ascompat, singlebyte = single_byte_optimizable(str);
5508  unsigned int save;
5509 
5510  if (argc == 0) {
5511  enc = STR_ENC_GET(str);
5512  }
5513  else {
5514  for (i=0; i<argc; i++) {
5515  VALUE s = argv[i];
5516 
5517  StringValue(s);
5518  enc = rb_enc_check(str, s);
5519  if (singlebyte && !single_byte_optimizable(s))
5520  singlebyte = 0;
5521  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5522  }
5523  }
5524 
5525  str_modify_keep_cr(str);
5526  s = t = RSTRING_PTR(str);
5527  if (!s || RSTRING_LEN(str) == 0) return Qnil;
5528  send = RSTRING_END(str);
5529  save = -1;
5530  ascompat = rb_enc_asciicompat(enc);
5531 
5532  if (singlebyte) {
5533  while (s < send) {
5534  unsigned int c = *(unsigned char*)s++;
5535  if (c != save || (argc > 0 && !squeez[c])) {
5536  *t++ = save = c;
5537  }
5538  }
5539  } else {
5540  while (s < send) {
5541  unsigned int c;
5542  int clen;
5543 
5544  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5545  if (c != save || (argc > 0 && !squeez[c])) {
5546  *t++ = save = c;
5547  }
5548  s++;
5549  }
5550  else {
5551  c = rb_enc_codepoint_len(s, send, &clen, enc);
5552 
5553  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5554  if (t != s) rb_enc_mbcput(c, t, enc);
5555  save = c;
5556  t += clen;
5557  }
5558  s += clen;
5559  }
5560  }
5561  }
5562 
5563  *t = '\0';
5564  if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
5565  STR_SET_LEN(str, t - RSTRING_PTR(str));
5566  modify = 1;
5567  }
5568 
5569  if (modify) return str;
5570  return Qnil;
5571 }
5572 
5573 
5574 /*
5575  * call-seq:
5576  * str.squeeze([other_str]*) -> new_str
5577  *
5578  * Builds a set of characters from the <i>other_str</i> parameter(s) using the
5579  * procedure described for <code>String#count</code>. Returns a new string
5580  * where runs of the same character that occur in this set are replaced by a
5581  * single character. If no arguments are given, all runs of identical
5582  * characters are replaced by a single character.
5583  *
5584  * "yellow moon".squeeze #=> "yelow mon"
5585  * "  now   is  the".squeeze(" ") #=> " now is the"
5586  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
5587  */
5588 
5589 static VALUE
5591 {
5592  str = rb_str_dup(str);
5593  rb_str_squeeze_bang(argc, argv, str);
5594  return str;
5595 }
5596 
5597 
5598 /*
5599  * call-seq:
5600  * str.tr_s!(from_str, to_str) -> str or nil
5601  *
5602  * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5603  * returning <i>str</i>, or <code>nil</code> if no changes were made.
5604  */
5605 
5606 static VALUE
5608 {
5609  return tr_trans(str, src, repl, 1);
5610 }
5611 
5612 
5613 /*
5614  * call-seq:
5615  * str.tr_s(from_str, to_str) -> new_str
5616  *
5617  * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5618  * then removes duplicate characters in regions that were affected by the
5619  * translation.
5620  *
5621  * "hello".tr_s('l', 'r') #=> "hero"
5622  * "hello".tr_s('el', '*') #=> "h*o"
5623  * "hello".tr_s('el', 'hx') #=> "hhxo"
5624  */
5625 
5626 static VALUE
5627 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5628 {
5629  str = rb_str_dup(str);
5630  tr_trans(str, src, repl, 1);
5631  return str;
5632 }
5633 
5634 
5635 /*
5636  * call-seq:
5637  * str.count([other_str]+) -> fixnum
5638  *
5639  * Each <i>other_str</i> parameter defines a set of characters to count. The
5640  * intersection of these sets defines the characters to count in
5641  * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
5642  * negated. The sequence c1-c2 means all characters between c1 and c2.
5643  *
5644  * a = "hello world"
5645  * a.count "lo" #=> 5
5646  * a.count "lo", "o" #=> 2
5647  * a.count "hello", "^l" #=> 4
5648  * a.count "ej-m" #=> 4
5649  */
5650 
5651 static VALUE
5653 {
5654  char table[TR_TABLE_SIZE];
5655  rb_encoding *enc = 0;
5656  VALUE del = 0, nodel = 0;
5657  char *s, *send;
5658  int i;
5659  int ascompat;
5660 
5661  if (argc < 1) {
5662  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5663  }
5664  for (i=0; i<argc; i++) {
5665  VALUE tstr = argv[i];
5666  unsigned char c;
5667 
5668  StringValue(tstr);
5669  enc = rb_enc_check(str, tstr);
5670  if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5671  (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
5672  int n = 0;
5673 
5674  s = RSTRING_PTR(str);
5675  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5676  send = RSTRING_END(str);
5677  while (s < send) {
5678  if (*(unsigned char*)s++ == c) n++;
5679  }
5680  return INT2NUM(n);
5681  }
5682  tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5683  }
5684 
5685  s = RSTRING_PTR(str);
5686  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5687  send = RSTRING_END(str);
5688  ascompat = rb_enc_asciicompat(enc);
5689  i = 0;
5690  while (s < send) {
5691  unsigned int c;
5692 
5693  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5694  if (table[c]) {
5695  i++;
5696  }
5697  s++;
5698  }
5699  else {
5700  int clen;
5701  c = rb_enc_codepoint_len(s, send, &clen, enc);
5702  if (tr_find(c, table, del, nodel)) {
5703  i++;
5704  }
5705  s += clen;
5706  }
5707  }
5708 
5709  return INT2NUM(i);
5710 }
5711 
/* Lookup table indexed by byte value: non-zero for bytes 9 through 13
 * and for byte 32, zero for every other byte. */
5712 static const char isspacetable[256] = {
5713  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
5714  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5715  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5716  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5717  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5718  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5719  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5720  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5721  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5722  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5723  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5724  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5725  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5726  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5727  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5728  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5729 };
5730 
/* Table lookup; the cast keeps the index within 0..255 even when c is a
 * negative char. */
5731 #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5732 
5733 /*
5734  * call-seq:
5735  * str.split(pattern=$;, [limit]) -> anArray
5736  *
5737  * Divides <i>str</i> into substrings based on a delimiter, returning an array
5738  * of these substrings.
5739  *
5740  * If <i>pattern</i> is a <code>String</code>, then its contents are used as
5741  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5742  * space, <i>str</i> is split on whitespace, with leading whitespace and runs
5743  * of contiguous whitespace characters ignored.
5744  *
5745  * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5746  * pattern matches. Whenever the pattern matches a zero-length string,
5747  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
5748  * groups, the respective matches will be returned in the array as well.
5749  *
5750  * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
5751  * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5752  * split on whitespace as if ` ' were specified.
5753  *
5754  * If the <i>limit</i> parameter is omitted, trailing null fields are
5755  * suppressed. If <i>limit</i> is a positive number, at most that number of
5756  * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5757  * string is returned as the only entry in an array). If negative, there is no
5758  * limit to the number of fields returned, and trailing null fields are not
5759  * suppressed.
5760  *
5761  * " now's  the time".split #=> ["now's", "the", "time"]
5762  * " now's  the time".split(' ') #=> ["now's", "the", "time"]
5763  * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
5764  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5765  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
5766  * "hello".split(//, 3) #=> ["h", "e", "llo"]
5767  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
5768  *
5769  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
5770  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
5771  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
5772  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
5773  */
5774 
5775 static VALUE
5777 {
5778  rb_encoding *enc;
5779  VALUE spat;
5780  VALUE limit;
5781  enum {awk, string, regexp} split_type;
5782  long beg, end, i = 0;
5783  int lim = 0;
5784  VALUE result, tmp;
5785 
5786  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
5787  lim = NUM2INT(limit);
5788  if (lim <= 0) limit = Qnil;
5789  else if (lim == 1) {
5790  if (RSTRING_LEN(str) == 0)
5791  return rb_ary_new2(0);
5792  return rb_ary_new3(1, str);
5793  }
5794  i = 1;
5795  }
5796 
5797  enc = STR_ENC_GET(str);
5798  if (NIL_P(spat)) {
5799  if (!NIL_P(rb_fs)) {
5800  spat = rb_fs;
5801  goto fs_set;
5802  }
5803  split_type = awk;
5804  }
5805  else {
5806  fs_set:
5807  if (TYPE(spat) == T_STRING) {
5808  rb_encoding *enc2 = STR_ENC_GET(spat);
5809 
5810  split_type = string;
5811  if (RSTRING_LEN(spat) == 0) {
5812  /* Special case - split into chars */
5813  spat = rb_reg_regcomp(spat);
5814  split_type = regexp;
5815  }
5816  else if (rb_enc_asciicompat(enc2) == 1) {
5817  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
5818  split_type = awk;
5819  }
5820  }
5821  else {
5822  int l;
5823  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
5824  RSTRING_LEN(spat) == l) {
5825  split_type = awk;
5826  }
5827  }
5828  }
5829  else {
5830  spat = get_pat(spat, 1);
5831  split_type = regexp;
5832  }
5833  }
5834 
5835  result = rb_ary_new();
5836  beg = 0;
5837  if (split_type == awk) {
5838  char *ptr = RSTRING_PTR(str);
5839  char *eptr = RSTRING_END(str);
5840  char *bptr = ptr;
5841  int skip = 1;
5842  unsigned int c;
5843 
5844  end = beg;
5845  if (is_ascii_string(str)) {
5846  while (ptr < eptr) {
5847  c = (unsigned char)*ptr++;
5848  if (skip) {
5849  if (ascii_isspace(c)) {
5850  beg = ptr - bptr;
5851  }
5852  else {
5853  end = ptr - bptr;
5854  skip = 0;
5855  if (!NIL_P(limit) && lim <= i) break;
5856  }
5857  }
5858  else if (ascii_isspace(c)) {
5859  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5860  skip = 1;
5861  beg = ptr - bptr;
5862  if (!NIL_P(limit)) ++i;
5863  }
5864  else {
5865  end = ptr - bptr;
5866  }
5867  }
5868  }
5869  else {
5870  while (ptr < eptr) {
5871  int n;
5872 
5873  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
5874  ptr += n;
5875  if (skip) {
5876  if (rb_isspace(c)) {
5877  beg = ptr - bptr;
5878  }
5879  else {
5880  end = ptr - bptr;
5881  skip = 0;
5882  if (!NIL_P(limit) && lim <= i) break;
5883  }
5884  }
5885  else if (rb_isspace(c)) {
5886  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5887  skip = 1;
5888  beg = ptr - bptr;
5889  if (!NIL_P(limit)) ++i;
5890  }
5891  else {
5892  end = ptr - bptr;
5893  }
5894  }
5895  }
5896  }
5897  else if (split_type == string) {
5898  char *ptr = RSTRING_PTR(str);
5899  char *temp = ptr;
5900  char *eptr = RSTRING_END(str);
5901  char *sptr = RSTRING_PTR(spat);
5902  long slen = RSTRING_LEN(spat);
5903 
5904  if (is_broken_string(str)) {
5905  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
5906  }
5907  if (is_broken_string(spat)) {
5908  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
5909  }
5910  enc = rb_enc_check(str, spat);
5911  while (ptr < eptr &&
5912  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
5913  /* Check we are at the start of a char */
5914  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
5915  if (t != ptr + end) {
5916  ptr = t;
5917  continue;
5918  }
5919  rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
5920  ptr += end + slen;
5921  if (!NIL_P(limit) && lim <= ++i) break;
5922  }
5923  beg = ptr - temp;
5924  }
5925  else {
5926  char *ptr = RSTRING_PTR(str);
5927  long len = RSTRING_LEN(str);
5928  long start = beg;
5929  long idx;
5930  int last_null = 0;
5931  struct re_registers *regs;
5932 
5933  while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
5934  regs = RMATCH_REGS(rb_backref_get());
5935  if (start == end && BEG(0) == END(0)) {
5936  if (!ptr) {
5937  rb_ary_push(result, str_new_empty(str));
5938  break;
5939  }
5940  else if (last_null == 1) {
5941  rb_ary_push(result, rb_str_subseq(str, beg,
5942  rb_enc_fast_mbclen(ptr+beg,
5943  ptr+len,
5944  enc)));
5945  beg = start;
5946  }
5947  else {
5948  if (ptr+start == ptr+len)
5949  start++;
5950  else
5951  start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
5952  last_null = 1;
5953  continue;
5954  }
5955  }
5956  else {
5957  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5958  beg = start = END(0);
5959  }
5960  last_null = 0;
5961 
5962  for (idx=1; idx < regs->num_regs; idx++) {
5963  if (BEG(idx) == -1) continue;
5964  if (BEG(idx) == END(idx))
5965  tmp = str_new_empty(str);
5966  else
5967  tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5968  rb_ary_push(result, tmp);
5969  }
5970  if (!NIL_P(limit) && lim <= ++i) break;
5971  }
5972  }
5973  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
5974  if (RSTRING_LEN(str) == beg)
5975  tmp = str_new_empty(str);
5976  else
5977  tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5978  rb_ary_push(result, tmp);
5979  }
5980  if (NIL_P(limit) && lim == 0) {
5981  long len;
5982  while ((len = RARRAY_LEN(result)) > 0 &&
5983  (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
5984  rb_ary_pop(result);
5985  }
5986 
5987  return result;
5988 }
5989 
5990 VALUE
5991 rb_str_split(VALUE str, const char *sep0)
5992 {
5993  VALUE sep;
5994 
5995  StringValue(str);
5996  sep = rb_str_new2(sep0);
5997  return rb_str_split_m(1, &sep, str);
5998 }
5999 
6000 
6001 /*
6002  * call-seq:
6003  * str.each_line(separator=$/) {|substr| block } -> str
6004  * str.each_line(separator=$/) -> an_enumerator
6005  *
6006  * str.lines(separator=$/) {|substr| block } -> str
6007  * str.lines(separator=$/) -> an_enumerator
6008  *
6009  * Splits <i>str</i> using the supplied parameter as the record separator
6010  * (<code>$/</code> by default), passing each substring in turn to the supplied
6011  * block. If a zero-length record separator is supplied, the string is split
6012  * into paragraphs delimited by multiple successive newlines.
6013  *
6014  * If no block is given, an enumerator is returned instead.
6015  *
6016  * print "Example one\n"
6017  * "hello\nworld".each_line {|s| p s}
6018  * print "Example two\n"
6019  * "hello\nworld".each_line('l') {|s| p s}
6020  * print "Example three\n"
6021  * "hello\n\n\nworld".each_line('') {|s| p s}
6022  *
6023  * <em>produces:</em>
6024  *
6025  * Example one
6026  * "hello\n"
6027  * "world"
6028  * Example two
6029  * "hel"
6030  * "l"
6031  * "o\nworl"
6032  * "d"
6033  * Example three
6034  * "hello\n\n\n"
6035  * "world"
6036  */
6037 
6038 static VALUE
6040 {
6041  rb_encoding *enc;
6042  VALUE rs;
6043  unsigned int newline;
6044  const char *p, *pend, *s, *ptr;
6045  long len, rslen;
6046  VALUE line;
6047  int n;
6048  VALUE orig = str;
6049 
6050  if (argc == 0) {
6051  rs = rb_rs;
6052  }
6053  else {
6054  rb_scan_args(argc, argv, "01", &rs);
6055  }
6056  RETURN_ENUMERATOR(str, argc, argv);
6057  if (NIL_P(rs)) {
6058  rb_yield(str);
6059  return orig;
6060  }
6061  str = rb_str_new4(str);
6062  ptr = p = s = RSTRING_PTR(str);
6063  pend = p + RSTRING_LEN(str);
6064  len = RSTRING_LEN(str);
6065  StringValue(rs);
6066  if (rs == rb_default_rs) {
6067  enc = rb_enc_get(str);
6068  while (p < pend) {
6069  char *p0;
6070 
6071  p = memchr(p, '\n', pend - p);
6072  if (!p) break;
6073  p0 = rb_enc_left_char_head(s, p, pend, enc);
6074  if (!rb_enc_is_newline(p0, pend, enc)) {
6075  p++;
6076  continue;
6077  }
6078  p = p0 + rb_enc_mbclen(p0, pend, enc);
6079  line = rb_str_new5(str, s, p - s);
6080  OBJ_INFECT(line, str);
6081  rb_enc_cr_str_copy_for_substr(line, str);
6082  rb_yield(line);
6083  str_mod_check(str, ptr, len);
6084  s = p;
6085  }
6086  goto finish;
6087  }
6088 
6089  enc = rb_enc_check(str, rs);
6090  rslen = RSTRING_LEN(rs);
6091  if (rslen == 0) {
6092  newline = '\n';
6093  }
6094  else {
6095  newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
6096  }
6097 
6098  while (p < pend) {
6099  unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6100 
6101  again:
6102  if (rslen == 0 && c == newline) {
6103  p += n;
6104  if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6105  goto again;
6106  }
6107  while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6108  p += n;
6109  }
6110  p -= n;
6111  }
6112  if (c == newline &&
6113  (rslen <= 1 ||
6114  (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
6115  line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
6116  OBJ_INFECT(line, str);
6117  rb_enc_cr_str_copy_for_substr(line, str);
6118  rb_yield(line);
6119  str_mod_check(str, ptr, len);
6120  s = p + (rslen ? rslen : n);
6121  }
6122  p += n;
6123  }
6124 
6125  finish:
6126  if (s != pend) {
6127  line = rb_str_new5(str, s, pend - s);
6128  OBJ_INFECT(line, str);
6129  rb_enc_cr_str_copy_for_substr(line, str);
6130  rb_yield(line);
6131  }
6132 
6133  return orig;
6134 }
6135 
6136 
6137 /*
6138  * call-seq:
6139  * str.bytes {|fixnum| block } -> str
6140  * str.bytes -> an_enumerator
6141  *
6142  * str.each_byte {|fixnum| block } -> str
6143  * str.each_byte -> an_enumerator
6144  *
6145  * Passes each byte in <i>str</i> to the given block, or returns
6146  * an enumerator if no block is given.
6147  *
6148  * "hello".each_byte {|c| print c, ' ' }
6149  *
6150  * <em>produces:</em>
6151  *
6152  * 104 101 108 108 111
6153  */
6154 
6155 static VALUE
6157 {
6158  long i;
6159 
6160  RETURN_ENUMERATOR(str, 0, 0);
6161  for (i=0; i<RSTRING_LEN(str); i++) {
6162  rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
6163  }
6164  return str;
6165 }
6166 
6167 
6168 /*
6169  * call-seq:
6170  * str.chars {|cstr| block } -> str
6171  * str.chars -> an_enumerator
6172  *
6173  * str.each_char {|cstr| block } -> str
6174  * str.each_char -> an_enumerator
6175  *
6176  * Passes each character in <i>str</i> to the given block, or returns
6177  * an enumerator if no block is given.
6178  *
6179  * "hello".each_char {|c| print c, ' ' }
6180  *
6181  * <em>produces:</em>
6182  *
6183  * h e l l o
6184  */
6185 
6186 static VALUE
6188 {
6189  VALUE orig = str;
6190  long i, len, n;
6191  const char *ptr;
6192  rb_encoding *enc;
6193 
6194  RETURN_ENUMERATOR(str, 0, 0);
6195  str = rb_str_new4(str);
6196  ptr = RSTRING_PTR(str);
6197  len = RSTRING_LEN(str);
6198  enc = rb_enc_get(str);
6199  switch (ENC_CODERANGE(str)) {
6200  case ENC_CODERANGE_VALID:
6201  case ENC_CODERANGE_7BIT:
6202  for (i = 0; i < len; i += n) {
6203  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6204  rb_yield(rb_str_subseq(str, i, n));
6205  }
6206  break;
6207  default:
6208  for (i = 0; i < len; i += n) {
6209  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6210  rb_yield(rb_str_subseq(str, i, n));
6211  }
6212  }
6213  return orig;
6214 }
6215 
6216 /*
6217  * call-seq:
6218  * str.codepoints {|integer| block } -> str
6219  * str.codepoints -> an_enumerator
6220  *
6221  * str.each_codepoint {|integer| block } -> str
6222  * str.each_codepoint -> an_enumerator
6223  *
6224  * Passes the <code>Integer</code> ordinal of each character in <i>str</i>
6225  * (also known as a <i>codepoint</i> when applied to Unicode strings) to
6226  * the given block.
6227  *
6228  * If no block is given, an enumerator is returned instead.
6229  *
6230  * "hello\u0639".each_codepoint {|c| print c, ' ' }
6231  *
6232  * <em>produces:</em>
6233  *
6234  * 104 101 108 108 111 1593
6235  */
6236 
6237 static VALUE
6239 {
6240  VALUE orig = str;
6241  int n;
6242  unsigned int c;
6243  const char *ptr, *end;
6244  rb_encoding *enc;
6245 
6246  if (single_byte_optimizable(str)) return rb_str_each_byte(str);
6247  RETURN_ENUMERATOR(str, 0, 0);
6248  str = rb_str_new4(str);
6249  ptr = RSTRING_PTR(str);
6250  end = RSTRING_END(str);
6251  enc = STR_ENC_GET(str);
6252  while (ptr < end) {
6253  c = rb_enc_codepoint_len(ptr, end, &n, enc);
6254  rb_yield(UINT2NUM(c));
6255  ptr += n;
6256  }
6257  return orig;
6258 }
6259 
6260 static long
6262 {
6263  rb_encoding *enc = STR_ENC_GET(str);
6264  const char *p, *p2, *beg, *end;
6265 
6266  beg = RSTRING_PTR(str);
6267  end = beg + RSTRING_LEN(str);
6268  if (beg > end) return 0;
6269  p = rb_enc_prev_char(beg, end, end, enc);
6270  if (!p) return 0;
6271  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6272  p2 = rb_enc_prev_char(beg, p, end, enc);
6273  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6274  }
6275  return p - beg;
6276 }
6277 
6278 /*
6279  * call-seq:
6280  * str.chop! -> str or nil
6281  *
6282  * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6283  * or <code>nil</code> if <i>str</i> is the empty string. See also
6284  * <code>String#chomp!</code>.
6285  */
6286 
6287 static VALUE
6289 {
6290  str_modify_keep_cr(str);
6291  if (RSTRING_LEN(str) > 0) {
6292  long len;
6293  len = chopped_length(str);
6294  STR_SET_LEN(str, len);
6295  RSTRING_PTR(str)[len] = '\0';
6296  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6297  ENC_CODERANGE_CLEAR(str);
6298  }
6299  return str;
6300  }
6301  return Qnil;
6302 }
6303 
6304 
6305 /*
6306  * call-seq:
6307  * str.chop -> new_str
6308  *
6309  * Returns a new <code>String</code> with the last character removed. If the
6310  * string ends with <code>\r\n</code>, both characters are removed. Applying
6311  * <code>chop</code> to an empty string returns an empty
6312  * string. <code>String#chomp</code> is often a safer alternative, as it leaves
6313  * the string unchanged if it doesn't end in a record separator.
6314  *
6315  * "string\r\n".chop #=> "string"
6316  * "string\n\r".chop #=> "string\n"
6317  * "string\n".chop #=> "string"
6318  * "string".chop #=> "strin"
6319  * "x".chop.chop #=> ""
6320  */
6321 
6322 static VALUE
6324 {
6325  VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
6326  rb_enc_cr_str_copy_for_substr(str2, str);
6327  OBJ_INFECT(str2, str);
6328  return str2;
6329 }
6330 
6331 
6332 /*
6333  * call-seq:
6334  * str.chomp!(separator=$/) -> str or nil
6335  *
6336  * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6337  * returning <i>str</i>, or <code>nil</code> if no modifications were made.
6338  */
6339 
6340 static VALUE
6342 {
6343  rb_encoding *enc;
6344  VALUE rs;
6345  int newline;
6346  char *p, *pp, *e;
6347  long len, rslen;
6348 
6349  str_modify_keep_cr(str);
6350  len = RSTRING_LEN(str);
6351  if (len == 0) return Qnil;
6352  p = RSTRING_PTR(str);
6353  e = p + len;
6354  if (argc == 0) {
6355  rs = rb_rs;
6356  if (rs == rb_default_rs) {
6357  smart_chomp:
6358  enc = rb_enc_get(str);
6359  if (rb_enc_mbminlen(enc) > 1) {
6360  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6361  if (rb_enc_is_newline(pp, e, enc)) {
6362  e = pp;
6363  }
6364  pp = e - rb_enc_mbminlen(enc);
6365  if (pp >= p) {
6366  pp = rb_enc_left_char_head(p, pp, e, enc);
6367  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6368  e = pp;
6369  }
6370  }
6371  if (e == RSTRING_END(str)) {
6372  return Qnil;
6373  }
6374  len = e - RSTRING_PTR(str);
6375  STR_SET_LEN(str, len);
6376  }
6377  else {
6378  if (RSTRING_PTR(str)[len-1] == '\n') {
6379  STR_DEC_LEN(str);
6380  if (RSTRING_LEN(str) > 0 &&
6381  RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6382  STR_DEC_LEN(str);
6383  }
6384  }
6385  else if (RSTRING_PTR(str)[len-1] == '\r') {
6386  STR_DEC_LEN(str);
6387  }
6388  else {
6389  return Qnil;
6390  }
6391  }
6392  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6393  return str;
6394  }
6395  }
6396  else {
6397  rb_scan_args(argc, argv, "01", &rs);
6398  }
6399  if (NIL_P(rs)) return Qnil;
6400  StringValue(rs);
6401  rslen = RSTRING_LEN(rs);
6402  if (rslen == 0) {
6403  while (len>0 && p[len-1] == '\n') {
6404  len--;
6405  if (len>0 && p[len-1] == '\r')
6406  len--;
6407  }
6408  if (len < RSTRING_LEN(str)) {
6409  STR_SET_LEN(str, len);
6410  RSTRING_PTR(str)[len] = '\0';
6411  return str;
6412  }
6413  return Qnil;
6414  }
6415  if (rslen > len) return Qnil;
6416  newline = RSTRING_PTR(rs)[rslen-1];
6417  if (rslen == 1 && newline == '\n')
6418  goto smart_chomp;
6419 
6420  enc = rb_enc_check(str, rs);
6421  if (is_broken_string(rs)) {
6422  return Qnil;
6423  }
6424  pp = e - rslen;
6425  if (p[len-1] == newline &&
6426  (rslen <= 1 ||
6427  memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
6428  if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6429  return Qnil;
6430  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6431  ENC_CODERANGE_CLEAR(str);
6432  }
6433  STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6434  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6435  return str;
6436  }
6437  return Qnil;
6438 }
6439 
6440 
6441 /*
6442  * call-seq:
6443  * str.chomp(separator=$/) -> new_str
6444  *
6445  * Returns a new <code>String</code> with the given record separator removed
6446  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
6447  * changed from the default Ruby record separator, then <code>chomp</code> also
6448  * removes carriage return characters (that is it will remove <code>\n</code>,
6449  * <code>\r</code>, and <code>\r\n</code>).
6450  *
6451  * "hello".chomp #=> "hello"
6452  * "hello\n".chomp #=> "hello"
6453  * "hello\r\n".chomp #=> "hello"
6454  * "hello\n\r".chomp #=> "hello\n"
6455  * "hello\r".chomp #=> "hello"
6456  * "hello \n there".chomp #=> "hello \n there"
6457  * "hello".chomp("llo") #=> "he"
6458  */
6459 
6460 static VALUE
6462 {
6463  str = rb_str_dup(str);
6464  rb_str_chomp_bang(argc, argv, str);
6465  return str;
6466 }
6467 
6468 /*
6469  * call-seq:
6470  * str.lstrip! -> self or nil
6471  *
6472  * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6473  * change was made. See also <code>String#rstrip!</code> and
6474  * <code>String#strip!</code>.
6475  *
6476  * " hello ".lstrip #=> "hello "
6477  * "hello".lstrip! #=> nil
6478  */
6479 
6480 static VALUE
6482 {
6483  rb_encoding *enc;
6484  char *s, *t, *e;
6485 
6486  str_modify_keep_cr(str);
6487  enc = STR_ENC_GET(str);
6488  s = RSTRING_PTR(str);
6489  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6490  e = t = RSTRING_END(str);
6491  /* remove spaces at head */
6492  while (s < e) {
6493  int n;
6494  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6495 
6496  if (!rb_isspace(cc)) break;
6497  s += n;
6498  }
6499 
6500  if (s > RSTRING_PTR(str)) {
6501  STR_SET_LEN(str, t-s);
6502  memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6503  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6504  return str;
6505  }
6506  return Qnil;
6507 }
6508 
6509 
6510 /*
6511  * call-seq:
6512  * str.lstrip -> new_str
6513  *
6514  * Returns a copy of <i>str</i> with leading whitespace removed. See also
6515  * <code>String#rstrip</code> and <code>String#strip</code>.
6516  *
6517  * " hello ".lstrip #=> "hello "
6518  * "hello".lstrip #=> "hello"
6519  */
6520 
6521 static VALUE
6523 {
6524  str = rb_str_dup(str);
6525  rb_str_lstrip_bang(str);
6526  return str;
6527 }
6528 
6529 
6530 /*
6531  * call-seq:
6532  * str.rstrip! -> self or nil
6533  *
6534  * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
6535  * no change was made. See also <code>String#lstrip!</code> and
6536  * <code>String#strip!</code>.
6537  *
6538  * " hello ".rstrip #=> " hello"
6539  * "hello".rstrip! #=> nil
6540  */
6541 
6542 static VALUE
6544 {
6545  rb_encoding *enc;
6546  char *s, *t, *e;
6547 
6548  str_modify_keep_cr(str);
6549  enc = STR_ENC_GET(str);
6551  s = RSTRING_PTR(str);
6552  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6553  t = e = RSTRING_END(str);
6554 
6555  /* remove trailing spaces or '\0's */
6556  if (single_byte_optimizable(str)) {
6557  unsigned char c;
6558  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
6559  }
6560  else {
6561  char *tp;
6562 
6563  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
6564  unsigned int c = rb_enc_codepoint(tp, e, enc);
6565  if (c && !rb_isspace(c)) break;
6566  t = tp;
6567  }
6568  }
6569  if (t < e) {
6570  long len = t-RSTRING_PTR(str);
6571 
6572  STR_SET_LEN(str, len);
6573  RSTRING_PTR(str)[len] = '\0';
6574  return str;
6575  }
6576  return Qnil;
6577 }
6578 
6579 
6580 /*
6581  * call-seq:
6582  * str.rstrip -> new_str
6583  *
6584  * Returns a copy of <i>str</i> with trailing whitespace removed. See also
6585  * <code>String#lstrip</code> and <code>String#strip</code>.
6586  *
6587  * " hello ".rstrip #=> " hello"
6588  * "hello".rstrip #=> "hello"
6589  */
6590 
6591 static VALUE
6593 {
6594  str = rb_str_dup(str);
6595  rb_str_rstrip_bang(str);
6596  return str;
6597 }
6598 
6599 
6600 /*
6601  * call-seq:
6602  * str.strip! -> str or nil
6603  *
6604  * Removes leading and trailing whitespace from <i>str</i>. Returns
6605  * <code>nil</code> if <i>str</i> was not altered.
6606  */
6607 
6608 static VALUE
6610 {
6611  VALUE l = rb_str_lstrip_bang(str);
6612  VALUE r = rb_str_rstrip_bang(str);
6613 
6614  if (NIL_P(l) && NIL_P(r)) return Qnil;
6615  return str;
6616 }
6617 
6618 
6619 /*
6620  * call-seq:
6621  * str.strip -> new_str
6622  *
6623  * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
6624  *
6625  * " hello ".strip #=> "hello"
6626  * "\tgoodbye\r\n".strip #=> "goodbye"
6627  */
6628 
6629 static VALUE
6631 {
6632  str = rb_str_dup(str);
6633  rb_str_strip_bang(str);
6634  return str;
6635 }
6636 
6637 static VALUE
6638 scan_once(VALUE str, VALUE pat, long *start)
6639 {
6640  VALUE result, match;
6641  struct re_registers *regs;
6642  int i;
6643 
6644  if (rb_reg_search(pat, str, *start, 0) >= 0) {
6645  match = rb_backref_get();
6646  regs = RMATCH_REGS(match);
6647  if (BEG(0) == END(0)) {
6648  rb_encoding *enc = STR_ENC_GET(str);
6649  /*
6650  * Always consume at least one character of the input string
6651  */
6652  if (RSTRING_LEN(str) > END(0))
6653  *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
6654  RSTRING_END(str), enc);
6655  else
6656  *start = END(0)+1;
6657  }
6658  else {
6659  *start = END(0);
6660  }
6661  if (regs->num_regs == 1) {
6662  return rb_reg_nth_match(0, match);
6663  }
6664  result = rb_ary_new2(regs->num_regs);
6665  for (i=1; i < regs->num_regs; i++) {
6666  rb_ary_push(result, rb_reg_nth_match(i, match));
6667  }
6668 
6669  return result;
6670  }
6671  return Qnil;
6672 }
6673 
6674 
6675 /*
6676  * call-seq:
6677  * str.scan(pattern) -> array
6678  * str.scan(pattern) {|match, ...| block } -> str
6679  *
6680  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
6681  * <code>Regexp</code> or a <code>String</code>). For each match, a result is
6682  * generated and either added to the result array or passed to the block. If
6683  * the pattern contains no groups, each individual result consists of the
6684  * matched string, <code>$&</code>. If the pattern contains groups, each
6685  * individual result is itself an array containing one entry per group.
6686  *
6687  * a = "cruel world"
6688  * a.scan(/\w+/) #=> ["cruel", "world"]
6689  * a.scan(/.../) #=> ["cru", "el ", "wor"]
6690  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
6691  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
6692  *
6693  * And the block form:
6694  *
6695  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
6696  * print "\n"
6697  * a.scan(/(.)(.)/) {|x,y| print y, x }
6698  * print "\n"
6699  *
6700  * <em>produces:</em>
6701  *
6702  * <<cruel>> <<world>>
6703  * rceu lowlr
6704  */
6705 
6706 static VALUE
6708 {
6709  VALUE result;
6710  long start = 0;
6711  long last = -1, prev = 0;
6712  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
6713 
6714  pat = get_pat(pat, 1);
6715  if (!rb_block_given_p()) {
6716  VALUE ary = rb_ary_new();
6717 
6718  while (!NIL_P(result = scan_once(str, pat, &start))) {
6719  last = prev;
6720  prev = start;
6721  rb_ary_push(ary, result);
6722  }
6723  if (last >= 0) rb_reg_search(pat, str, last, 0);
6724  return ary;
6725  }
6726 
6727  while (!NIL_P(result = scan_once(str, pat, &start))) {
6728  last = prev;
6729  prev = start;
6730  rb_yield(result);
6731  str_mod_check(str, p, len);
6732  }
6733  if (last >= 0) rb_reg_search(pat, str, last, 0);
6734  return str;
6735 }
6736 
6737 
6738 /*
6739  * call-seq:
6740  * str.hex -> integer
6741  *
6742  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
6743  * (with an optional sign and an optional <code>0x</code>) and returns the
6744  * corresponding number. Zero is returned on error.
6745  *
6746  * "0x0a".hex #=> 10
6747  * "-1234".hex #=> -4660
6748  * "0".hex #=> 0
6749  * "wombat".hex #=> 0
6750  */
6751 
6752 static VALUE
6754 {
6755  rb_encoding *enc = rb_enc_get(str);
6756 
6757  if (!rb_enc_asciicompat(enc)) {
6758  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6759  }
6760  return rb_str_to_inum(str, 16, FALSE);
6761 }
6762 
6763 
6764 /*
6765  * call-seq:
6766  * str.oct -> integer
6767  *
6768  * Treats leading characters of <i>str</i> as a string of octal digits (with an
6769  * optional sign) and returns the corresponding number. Returns 0 if the
6770  * conversion fails.
6771  *
6772  * "123".oct #=> 83
6773  * "-377".oct #=> -255
6774  * "bad".oct #=> 0
6775  * "0377bad".oct #=> 255
6776  */
6777 
6778 static VALUE
6780 {
6781  rb_encoding *enc = rb_enc_get(str);
6782 
6783  if (!rb_enc_asciicompat(enc)) {
6784  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6785  }
6786  return rb_str_to_inum(str, -8, FALSE);
6787 }
6788 
6789 
6790 /*
6791  * call-seq:
6792  * str.crypt(other_str) -> new_str
6793  *
6794  * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
6795  * library function <code>crypt</code>. The argument is the salt string, which
6796  * should be two characters long, each character drawn from
6797  * <code>[a-zA-Z0-9./]</code>.
6798  */
6799 
6800 static VALUE
6802 {
6803  extern char *crypt(const char *, const char *);
6804  VALUE result;
6805  const char *s, *saltp;
6806  char *res;
6807 #ifdef BROKEN_CRYPT
6808  char salt_8bit_clean[3];
6809 #endif
6810 
6811  StringValue(salt);
6812  if (RSTRING_LEN(salt) < 2)
6813  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
6814 
6815  s = RSTRING_PTR(str);
6816  if (!s) s = "";
6817  saltp = RSTRING_PTR(salt);
6818 #ifdef BROKEN_CRYPT
6819  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
6820  salt_8bit_clean[0] = saltp[0] & 0x7f;
6821  salt_8bit_clean[1] = saltp[1] & 0x7f;
6822  salt_8bit_clean[2] = '\0';
6823  saltp = salt_8bit_clean;
6824  }
6825 #endif
6826  res = crypt(s, saltp);
6827  if (!res) {
6828  rb_sys_fail("crypt");
6829  }
6830  result = rb_str_new2(res);
6831  OBJ_INFECT(result, str);
6832  OBJ_INFECT(result, salt);
6833  return result;
6834 }
6835 
6836 
6837 /*
6838  * call-seq:
6839  * str.intern -> symbol
6840  * str.to_sym -> symbol
6841  *
6842  * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
6843  * symbol if it did not previously exist. See <code>Symbol#id2name</code>.
6844  *
6845  * "Koala".intern #=> :Koala
6846  * s = 'cat'.to_sym #=> :cat
6847  * s == :cat #=> true
6848  * s = '@cat'.to_sym #=> :@cat
6849  * s == :@cat #=> true
6850  *
6851  * This can also be used to create symbols that cannot be represented using the
6852  * <code>:xxx</code> notation.
6853  *
6854  * 'cat and dog'.to_sym #=> :"cat and dog"
6855  */
6856 
6857 VALUE
6859 {
6860  VALUE str = RB_GC_GUARD(s);
6861  ID id;
6862 
6863  id = rb_intern_str(str);
6864  return ID2SYM(id);
6865 }
6866 
6867 
6868 /*
6869  * call-seq:
6870  * str.ord -> integer
6871  *
6872  * Return the <code>Integer</code> ordinal of a one-character string.
6873  *
6874  * "a".ord #=> 97
6875  */
6876 
6877 VALUE
6879 {
6880  unsigned int c;
6881 
6883  return UINT2NUM(c);
6884 }
6885 /*
6886  * call-seq:
6887  * str.sum(n=16) -> integer
6888  *
6889  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
6890  * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
6891  * to 16. The result is simply the sum of the binary value of each character in
6892  * <i>str</i> modulo <code>2**n</code> (the sum is masked with <code>2**n - 1</code>). This is not a particularly good
6893  * checksum.
6894  */
6895 
6896 static VALUE
6898 {
6899  VALUE vbits;
6900  int bits;
6901  char *ptr, *p, *pend;
6902  long len;
6903  VALUE sum = INT2FIX(0);
6904  unsigned long sum0 = 0;
6905 
6906  if (argc == 0) {
6907  bits = 16;
6908  }
6909  else {
6910  rb_scan_args(argc, argv, "01", &vbits);
6911  bits = NUM2INT(vbits);
6912  }
6913  ptr = p = RSTRING_PTR(str);
6914  len = RSTRING_LEN(str);
6915  pend = p + len;
6916 
6917  while (p < pend) {
6918  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
6919  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6920  str_mod_check(str, ptr, len);
6921  sum0 = 0;
6922  }
6923  sum0 += (unsigned char)*p;
6924  p++;
6925  }
6926 
6927  if (bits == 0) {
6928  if (sum0) {
6929  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6930  }
6931  }
6932  else {
6933  if (sum == INT2FIX(0)) {
6934  if (bits < (int)sizeof(long)*CHAR_BIT) {
6935  sum0 &= (((unsigned long)1)<<bits)-1;
6936  }
6937  sum = LONG2FIX(sum0);
6938  }
6939  else {
6940  VALUE mod;
6941 
6942  if (sum0) {
6943  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6944  }
6945 
6946  mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
6947  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
6948  sum = rb_funcall(sum, '&', 1, mod);
6949  }
6950  }
6951  return sum;
6952 }
6953 
6954 static VALUE
6955 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
6956 {
6957  rb_encoding *enc;
6958  VALUE w;
6959  long width, len, flen = 1, fclen = 1;
6960  VALUE res;
6961  char *p;
6962  const char *f = " ";
6963  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
6964  volatile VALUE pad;
6965  int singlebyte = 1, cr;
6966 
6967  rb_scan_args(argc, argv, "11", &w, &pad);
6968  enc = STR_ENC_GET(str);
6969  width = NUM2LONG(w);
6970  if (argc == 2) {
6971  StringValue(pad);
6972  enc = rb_enc_check(str, pad);
6973  f = RSTRING_PTR(pad);
6974  flen = RSTRING_LEN(pad);
6975  fclen = str_strlen(pad, enc);
6976  singlebyte = single_byte_optimizable(pad);
6977  if (flen == 0 || fclen == 0) {
6978  rb_raise(rb_eArgError, "zero width padding");
6979  }
6980  }
6981  len = str_strlen(str, enc);
6982  if (width < 0 || len >= width) return rb_str_dup(str);
6983  n = width - len;
6984  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
6985  rlen = n - llen;
6986  cr = ENC_CODERANGE(str);
6987  if (flen > 1) {
6988  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
6989  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
6990  }
6991  size = RSTRING_LEN(str);
6992  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
6993  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
6994  (len += llen2 + rlen2) >= LONG_MAX - size) {
6995  rb_raise(rb_eArgError, "argument too big");
6996  }
6997  len += size;
6998  res = rb_str_new5(str, 0, len);
6999  p = RSTRING_PTR(res);
7000  if (flen <= 1) {
7001  memset(p, *f, llen);
7002  p += llen;
7003  }
7004  else {
7005  while (llen >= fclen) {
7006  memcpy(p,f,flen);
7007  p += flen;
7008  llen -= fclen;
7009  }
7010  if (llen > 0) {
7011  memcpy(p, f, llen2);
7012  p += llen2;
7013  }
7014  }
7015  memcpy(p, RSTRING_PTR(str), size);
7016  p += size;
7017  if (flen <= 1) {
7018  memset(p, *f, rlen);
7019  p += rlen;
7020  }
7021  else {
7022  while (rlen >= fclen) {
7023  memcpy(p,f,flen);
7024  p += flen;
7025  rlen -= fclen;
7026  }
7027  if (rlen > 0) {
7028  memcpy(p, f, rlen2);
7029  p += rlen2;
7030  }
7031  }
7032  *p = '\0';
7033  STR_SET_LEN(res, p-RSTRING_PTR(res));
7034  OBJ_INFECT(res, str);
7035  if (!NIL_P(pad)) OBJ_INFECT(res, pad);
7036  rb_enc_associate(res, enc);
7037  if (argc == 2)
7038  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
7039  if (cr != ENC_CODERANGE_BROKEN)
7040  ENC_CODERANGE_SET(res, cr);
7041  return res;
7042 }
7043 
7044 
7045 /*
7046  * call-seq:
7047  * str.ljust(integer, padstr=' ') -> new_str
7048  *
7049  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7050  * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
7051  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7052  *
7053  * "hello".ljust(4) #=> "hello"
7054  * "hello".ljust(20) #=> "hello               "
7055  * "hello".ljust(20, '1234') #=> "hello123412341234123"
7056  */
7057 
7058 static VALUE
7060 {
7061  return rb_str_justify(argc, argv, str, 'l');
7062 }
7063 
7064 
7065 /*
7066  * call-seq:
7067  * str.rjust(integer, padstr=' ') -> new_str
7068  *
7069  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7070  * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
7071  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7072  *
7073  * "hello".rjust(4) #=> "hello"
7074  * "hello".rjust(20) #=> "               hello"
7075  * "hello".rjust(20, '1234') #=> "123412341234123hello"
7076  */
7077 
7078 static VALUE
7080 {
7081  return rb_str_justify(argc, argv, str, 'r');
7082 }
7083 
7084 
7085 /*
7086  * call-seq:
7087  * str.center(integer, padstr=' ') -> new_str
7088  *
7089  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7090  * <code>String</code> of length <i>integer</i> with <i>str</i> centered and
7091  * padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7092  *
7093  * "hello".center(4) #=> "hello"
7094  * "hello".center(20) #=> "       hello        "
7095  * "hello".center(20, '123') #=> "1231231hello12312312"
7096  */
7097 
7098 static VALUE
7100 {
7101  return rb_str_justify(argc, argv, str, 'c');
7102 }
7103 
7104 /*
7105  * call-seq:
7106  * str.partition(sep) -> [head, sep, tail]
7107  * str.partition(regexp) -> [head, match, tail]
7108  *
7109  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
7110  * and returns the part before it, the match, and the part
7111  * after it.
7112  * If it is not found, returns <i>str</i> and two empty strings.
7113  *
7114  * "hello".partition("l") #=> ["he", "l", "lo"]
7115  * "hello".partition("x") #=> ["hello", "", ""]
7116  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
7117  */
7118 
7119 static VALUE
7121 {
7122  long pos;
7123  int regex = FALSE;
7124 
7125  if (TYPE(sep) == T_REGEXP) {
7126  pos = rb_reg_search(sep, str, 0, 0);
7127  regex = TRUE;
7128  }
7129  else {
7130  VALUE tmp;
7131 
7132  tmp = rb_check_string_type(sep);
7133  if (NIL_P(tmp)) {
7134  rb_raise(rb_eTypeError, "type mismatch: %s given",
7135  rb_obj_classname(sep));
7136  }
7137  sep = tmp;
7138  pos = rb_str_index(str, sep, 0);
7139  }
7140  if (pos < 0) {
7141  failed:
7142  return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
7143  }
7144  if (regex) {
7145  sep = rb_str_subpat(str, sep, INT2FIX(0));
7146  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
7147  }
7148  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
7149  sep,
7150  rb_str_subseq(str, pos+RSTRING_LEN(sep),
7151  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
7152 }
7153 
7154 /*
7155  * call-seq:
7156  * str.rpartition(sep) -> [head, sep, tail]
7157  * str.rpartition(regexp) -> [head, match, tail]
7158  *
7159  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
7160  * of the string, and returns the part before it, the match, and the part
7161  * after it.
7162  * If it is not found, returns two empty strings and <i>str</i>.
7163  *
7164  * "hello".rpartition("l") #=> ["hel", "l", "o"]
7165  * "hello".rpartition("x") #=> ["", "", "hello"]
7166  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
7167  */
7168 
7169 static VALUE
7171 {
7172  long pos = RSTRING_LEN(str);
7173  int regex = FALSE;
7174 
7175  if (TYPE(sep) == T_REGEXP) {
7176  pos = rb_reg_search(sep, str, pos, 1);
7177  regex = TRUE;
7178  }
7179  else {
7180  VALUE tmp;
7181 
7182  tmp = rb_check_string_type(sep);
7183  if (NIL_P(tmp)) {
7184  rb_raise(rb_eTypeError, "type mismatch: %s given",
7185  rb_obj_classname(sep));
7186  }
7187  sep = tmp;
7188  pos = rb_str_sublen(str, pos);
7189  pos = rb_str_rindex(str, sep, pos);
7190  }
7191  if (pos < 0) {
7192  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
7193  }
7194  if (regex) {
7195  sep = rb_reg_nth_match(0, rb_backref_get());
7196  }
7197  return rb_ary_new3(3, rb_str_substr(str, 0, pos),
7198  sep,
7199  rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
7200 }
7201 
7202 /*
7203  * call-seq:
7204  * str.start_with?([prefix]+) -> true or false
7205  *
7206  * Returns true if <i>str</i> starts with one of the prefixes given.
7207  *
7208  * p "hello".start_with?("hell") #=> true
7209  *
7210  * # returns true if one of the prefixes matches.
7211  * p "hello".start_with?("heaven", "hell") #=> true
7212  * p "hello".start_with?("heaven", "paradise") #=> false
7213  *
7214  *
7215  *
7216  */
7217 
7218 static VALUE
7220 {
7221  int i;
7222 
7223  for (i=0; i<argc; i++) {
7224  VALUE tmp = rb_check_string_type(argv[i]);
7225  if (NIL_P(tmp)) continue;
7226  rb_enc_check(str, tmp);
7227  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7228  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7229  return Qtrue;
7230  }
7231  return Qfalse;
7232 }
7233 
7234 /*
7235  * call-seq:
7236  * str.end_with?([suffix]+) -> true or false
7237  *
7238  * Returns true if <i>str</i> ends with one of the suffixes given.
7239  */
7240 
7241 static VALUE
7243 {
7244  int i;
7245  char *p, *s, *e;
7246  rb_encoding *enc;
7247 
7248  for (i=0; i<argc; i++) {
7249  VALUE tmp = rb_check_string_type(argv[i]);
7250  if (NIL_P(tmp)) continue;
7251  enc = rb_enc_check(str, tmp);
7252  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7253  p = RSTRING_PTR(str);
7254  e = p + RSTRING_LEN(str);
7255  s = e - RSTRING_LEN(tmp);
7256  if (rb_enc_left_char_head(p, s, e, enc) != s)
7257  continue;
7258  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7259  return Qtrue;
7260  }
7261  return Qfalse;
7262 }
7263 
7264 void
7265 rb_str_setter(VALUE val, ID id, VALUE *var)
7266 {
7267  if (!NIL_P(val) && TYPE(val) != T_STRING) {
7268  rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
7269  }
7270  *var = val;
7271 }
7272 
7273 
7274 /*
7275  * call-seq:
7276  * str.force_encoding(encoding) -> str
7277  *
7278  * Changes the encoding to +encoding+ and returns self.
7279  */
7280 
7281 static VALUE
7283 {
7284  str_modifiable(str);
7285  rb_enc_associate(str, rb_to_encoding(enc));
7286  ENC_CODERANGE_CLEAR(str);
7287  return str;
7288 }
7289 
7290 /*
7291  * call-seq:
7292  * str.valid_encoding? -> true or false
7293  *
7294  * Returns true for a string which is encoded correctly.
7295  *
7296  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
7297  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
7298  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
7299  */
7300 
7301 static VALUE
7303 {
7304  int cr = rb_enc_str_coderange(str);
7305 
7306  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
7307 }
7308 
7309 /*
7310  * call-seq:
7311  * str.ascii_only? -> true or false
7312  *
7313  * Returns true for a string which has only ASCII characters.
7314  *
7315  * "abc".force_encoding("UTF-8").ascii_only? #=> true
7316  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
7317  */
7318 
7319 static VALUE
7321 {
7322  int cr = rb_enc_str_coderange(str);
7323 
7324  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
7325 }
7326 
7341 VALUE
7342 rb_str_ellipsize(VALUE str, long len)
7343 {
7344  static const char ellipsis[] = "...";
7345  const long ellipsislen = sizeof(ellipsis) - 1;
7346  rb_encoding *const enc = rb_enc_get(str);
7347  const long blen = RSTRING_LEN(str);
7348  const char *const p = RSTRING_PTR(str), *e = p + blen;
7349  VALUE estr, ret = 0;
7350 
7351  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
7352  if (len * rb_enc_mbminlen(enc) >= blen ||
7353  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
7354  ret = str;
7355  }
7356  else if (len <= ellipsislen ||
7357  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
7358  if (rb_enc_asciicompat(enc)) {
7359  ret = rb_str_new_with_class(str, ellipsis, len);
7360  rb_enc_associate(ret, enc);
7361  }
7362  else {
7363  estr = rb_usascii_str_new(ellipsis, len);
7364  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
7365  }
7366  }
7367  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
7368  rb_str_cat(ret, ellipsis, ellipsislen);
7369  }
7370  else {
7371  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
7372  rb_enc_from_encoding(enc), 0, Qnil);
7373  rb_str_append(ret, estr);
7374  }
7375  return ret;
7376 }
7377 
7378 /**********************************************************************
7379  * Document-class: Symbol
7380  *
7381  * <code>Symbol</code> objects represent names and some strings
7382  * inside the Ruby
7383  * interpreter. They are generated using the <code>:name</code> and
7384  * <code>:"string"</code> literal
7385  * syntax, and by the various <code>to_sym</code> methods. The same
7386  * <code>Symbol</code> object will be created for a given name or string
7387  * for the duration of a program's execution, regardless of the context
7388  * or meaning of that name. Thus if <code>Fred</code> is a constant in
7389  * one context, a method in another, and a class in a third, the
7390  * <code>Symbol</code> <code>:Fred</code> will be the same object in
7391  * all three contexts.
7392  *
7393  * module One
7394  * class Fred
7395  * end
7396  * $f1 = :Fred
7397  * end
7398  * module Two
7399  * Fred = 1
7400  * $f2 = :Fred
7401  * end
7402  * def Fred()
7403  * end
7404  * $f3 = :Fred
7405  * $f1.object_id #=> 2514190
7406  * $f2.object_id #=> 2514190
7407  * $f3.object_id #=> 2514190
7408  *
7409  */
7410 
7411 
7412 /*
7413  * call-seq:
7414  * sym == obj -> true or false
7415  *
7416  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
7417  * symbol, returns <code>true</code>.
7418  */
7419 
7420 static VALUE
7421 sym_equal(VALUE sym1, VALUE sym2)
7422 {
7423  if (sym1 == sym2) return Qtrue;
7424  return Qfalse;
7425 }
7426 
7427 
7428 static int
7429 sym_printable(const char *s, const char *send, rb_encoding *enc)
7430 {
7431  while (s < send) {
7432  int n;
7433  int c = rb_enc_codepoint_len(s, send, &n, enc);
7434 
7435  if (!rb_enc_isprint(c, enc)) return FALSE;
7436  s += n;
7437  }
7438  return TRUE;
7439 }
7440 
7441 /*
7442  * call-seq:
7443  * sym.inspect -> string
7444  *
7445  * Returns the representation of <i>sym</i> as a symbol literal.
7446  *
7447  * :fred.inspect #=> ":fred"
7448  */
7449 
7450 static VALUE
7452 {
7453  VALUE str;
7454  ID id = SYM2ID(sym);
7455  rb_encoding *enc;
7456  const char *ptr;
7457  long len;
7458  char *dest;
7460 
7461  if (resenc == NULL) resenc = rb_default_external_encoding();
7462  sym = rb_id2str(id);
7463  enc = STR_ENC_GET(sym);
7464  ptr = RSTRING_PTR(sym);
7465  len = RSTRING_LEN(sym);
7466  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
7467  !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
7468  str = rb_str_inspect(sym);
7469  len = RSTRING_LEN(str);
7470  rb_str_resize(str, len + 1);
7471  dest = RSTRING_PTR(str);
7472  memmove(dest + 1, dest, len);
7473  dest[0] = ':';
7474  }
7475  else {
7476  char *dest;
7477  str = rb_enc_str_new(0, len + 1, enc);
7478  dest = RSTRING_PTR(str);
7479  dest[0] = ':';
7480  memcpy(dest + 1, ptr, len);
7481  }
7482  return str;
7483 }
7484 
7485 
7486 /*
7487  * call-seq:
7488  * sym.id2name -> string
7489  * sym.to_s -> string
7490  *
7491  * Returns the name or string corresponding to <i>sym</i>.
7492  *
7493  * :fred.id2name #=> "fred"
7494  */
7495 
7496 
7497 VALUE
7499 {
7500  ID id = SYM2ID(sym);
7501 
7502  return str_new3(rb_cString, rb_id2str(id));
7503 }
7504 
7505 
7506 /*
7507  * call-seq:
7508  * sym.to_sym -> sym
7509  * sym.intern -> sym
7510  *
7511  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
7512  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
7513  * in this case.
7514  */
7515 
7516 static VALUE
7518 {
7519  return sym;
7520 }
7521 
7522 static VALUE
7523 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
7524 {
7525  VALUE obj;
7526 
7527  if (argc < 1) {
7528  rb_raise(rb_eArgError, "no receiver given");
7529  }
7530  obj = argv[0];
7531  return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
7532 }
7533 
7534 /*
7535  * call-seq:
7536  * sym.to_proc
7537  *
7538  * Returns a _Proc_ object which respond to the given method by _sym_.
7539  *
7540  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
7541  */
7542 
7543 static VALUE
7545 {
7546  static VALUE sym_proc_cache = Qfalse;
7547  enum {SYM_PROC_CACHE_SIZE = 67};
7548  VALUE proc;
7549  long id, index;
7550  VALUE *aryp;
7551 
7552  if (!sym_proc_cache) {
7553  sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
7554  rb_gc_register_mark_object(sym_proc_cache);
7555  rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
7556  }
7557 
7558  id = SYM2ID(sym);
7559  index = (id % SYM_PROC_CACHE_SIZE) << 1;
7560 
7561  aryp = RARRAY_PTR(sym_proc_cache);
7562  if (aryp[index] == sym) {
7563  return aryp[index + 1];
7564  }
7565  else {
7566  proc = rb_proc_new(sym_call, (VALUE)id);
7567  aryp[index] = sym;
7568  aryp[index + 1] = proc;
7569  return proc;
7570  }
7571 }
7572 
7573 /*
7574  * call-seq:
7575  *
7576  * sym.succ
7577  *
7578  * Same as <code>sym.to_s.succ.intern</code>.
7579  */
7580 
7581 static VALUE
7583 {
7584  return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
7585 }
7586 
7587 /*
7588  * call-seq:
7589  *
7590  * str <=> other -> -1, 0, +1 or nil
7591  *
7592  * Compares _sym_ with _other_ in string form.
7593  */
7594 
7595 static VALUE
7597 {
7598  if (!SYMBOL_P(other)) {
7599  return Qnil;
7600  }
7601  return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
7602 }
7603 
7604 /*
7605  * call-seq:
7606  *
7607  * sym.casecmp(other) -> -1, 0, +1 or nil
7608  *
7609  * Case-insensitive version of <code>Symbol#<=></code>.
7610  */
7611 
7612 static VALUE
7614 {
7615  if (!SYMBOL_P(other)) {
7616  return Qnil;
7617  }
7618  return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
7619 }
7620 
7621 /*
7622  * call-seq:
7623  * sym =~ obj -> fixnum or nil
7624  *
7625  * Returns <code>sym.to_s =~ obj</code>.
7626  */
7627 
7628 static VALUE
7630 {
7631  return rb_str_match(rb_sym_to_s(sym), other);
7632 }
7633 
7634 /*
7635  * call-seq:
7636  * sym[idx] -> char
7637  * sym[b, n] -> char
7638  *
7639  * Returns <code>sym.to_s[]</code>.
7640  */
7641 
7642 static VALUE
7644 {
7645  return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
7646 }
7647 
7648 /*
7649  * call-seq:
7650  * sym.length -> integer
7651  *
7652  * Same as <code>sym.to_s.length</code>.
7653  */
7654 
7655 static VALUE
7657 {
7658  return rb_str_length(rb_id2str(SYM2ID(sym)));
7659 }
7660 
7661 /*
7662  * call-seq:
7663  * sym.empty? -> true or false
7664  *
7665  * Returns that _sym_ is :"" or not.
7666  */
7667 
7668 static VALUE
7670 {
7671  return rb_str_empty(rb_id2str(SYM2ID(sym)));
7672 }
7673 
7674 /*
7675  * call-seq:
7676  * sym.upcase -> symbol
7677  *
7678  * Same as <code>sym.to_s.upcase.intern</code>.
7679  */
7680 
7681 static VALUE
7683 {
7685 }
7686 
7687 /*
7688  * call-seq:
7689  * sym.downcase -> symbol
7690  *
7691  * Same as <code>sym.to_s.downcase.intern</code>.
7692  */
7693 
7694 static VALUE
7696 {
7698 }
7699 
7700 /*
7701  * call-seq:
7702  * sym.capitalize -> symbol
7703  *
7704  * Same as <code>sym.to_s.capitalize.intern</code>.
7705  */
7706 
7707 static VALUE
7709 {
7711 }
7712 
7713 /*
7714  * call-seq:
7715  * sym.swapcase -> symbol
7716  *
7717  * Same as <code>sym.to_s.swapcase.intern</code>.
7718  */
7719 
7720 static VALUE
7722 {
7724 }
7725 
7726 /*
7727  * call-seq:
7728  * sym.encoding -> encoding
7729  *
7730  * Returns the Encoding object that represents the encoding of _sym_.
7731  */
7732 
7733 static VALUE
7735 {
7736  return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
7737 }
7738 
7739 ID
7741 {
7742  VALUE tmp;
7743 
7744  switch (TYPE(name)) {
7745  default:
7746  tmp = rb_check_string_type(name);
7747  if (NIL_P(tmp)) {
7748  tmp = rb_inspect(name);
7749  rb_raise(rb_eTypeError, "%s is not a symbol",
7750  RSTRING_PTR(tmp));
7751  }
7752  name = tmp;
7753  /* fall through */
7754  case T_STRING:
7755  name = rb_str_intern(name);
7756  /* fall through */
7757  case T_SYMBOL:
7758  return SYM2ID(name);
7759  }
7760  return Qnil; /* not reached */
7761 }
7762 
7763 /*
7764  * A <code>String</code> object holds and manipulates an arbitrary sequence of
7765  * bytes, typically representing characters. String objects may be created
7766  * using <code>String::new</code> or as literals.
7767  *
7768  * Because of aliasing issues, users of strings should be aware of the methods
7769  * that modify the contents of a <code>String</code> object. Typically,
7770  * methods with names ending in ``!'' modify their receiver, while those
7771  * without a ``!'' return a new <code>String</code>. However, there are
7772  * exceptions, such as <code>String#[]=</code>.
7773  *
7774  */
7775 
7776 void
7778 {
7779 #undef rb_intern
7780 #define rb_intern(str) rb_intern_const(str)
7781 
7782  rb_cString = rb_define_class("String", rb_cObject);
7786  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
7787  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
7791  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
7793  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
7799  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
7800  rb_define_method(rb_cString, "length", rb_str_length, 0);
7802  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
7803  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
7810  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
7813  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
7816  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
7817  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
7818  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
7819 
7820  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
7823  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
7824  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
7826 
7827  rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
7828  rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
7829  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
7830  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
7831 
7836 
7844  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
7846  rb_define_method(rb_cString, "concat", rb_str_concat, 1);
7848  rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
7850  rb_define_method(rb_cString, "intern", rb_str_intern, 0);
7851  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
7853 
7854  rb_define_method(rb_cString, "include?", rb_str_include, 1);
7855  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
7856  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
7857 
7859 
7860  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
7861  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
7862  rb_define_method(rb_cString, "center", rb_str_center, -1);
7863 
7864  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
7865  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
7867  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
7869  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
7870  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
7871 
7879 
7882  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
7883  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
7884  rb_define_method(rb_cString, "count", rb_str_count, -1);
7885 
7890 
7891  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
7892  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
7893  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
7894  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
7895 
7896  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
7897 
7898  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
7900 
7901  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
7902  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
7903 
7904  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
7905  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
7906  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
7908 
7909  id_to_s = rb_intern("to_s");
7910 
7911  rb_fs = Qnil;
7912  rb_define_variable("$;", &rb_fs);
7913  rb_define_variable("$-F", &rb_fs);
7914 
7915  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
7919  rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
7920 
7923  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
7925  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
7926  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
7927  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
7928  rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
7929  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
7930  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
7931 
7932  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
7933  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
7935 
7936  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
7937  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
7938  rb_define_method(rb_cSymbol, "length", sym_length, 0);
7939  rb_define_method(rb_cSymbol, "size", sym_length, 0);
7940  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
7941  rb_define_method(rb_cSymbol, "match", sym_match, 1);
7942 
7943  rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
7944  rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
7945  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
7946  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
7947 
7948  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
7949 }
7950