Ruby 1.9.3p429 (2013-05-15 revision 40747)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author: usa $
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/ruby.h"
15 #include "ruby/re.h"
16 #include "ruby/encoding.h"
17 #include "internal.h"
18 #include <assert.h>
19 
20 #define BEG(no) (regs->beg[(no)])
21 #define END(no) (regs->end[(no)])
22 
23 #include <math.h>
24 #include <ctype.h>
25 
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 
30 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
31 
32 #undef rb_str_new_cstr
33 #undef rb_tainted_str_new_cstr
34 #undef rb_usascii_str_new_cstr
35 #undef rb_external_str_new_cstr
36 #undef rb_locale_str_new_cstr
37 #undef rb_str_new2
38 #undef rb_str_new3
39 #undef rb_str_new4
40 #undef rb_str_new5
41 #undef rb_tainted_str_new2
42 #undef rb_usascii_str_new2
43 #undef rb_str_dup_frozen
44 #undef rb_str_buf_new_cstr
45 #undef rb_str_buf_new2
46 #undef rb_str_buf_cat2
47 #undef rb_str_cat2
48 
49 static VALUE rb_str_clear(VALUE str);
50 
53 
/*
 * Flag bits and predicates used by the string implementation in this file.
 *
 * STR_NOEMBED is tested by STR_EMBED_P(): when it is clear the string is
 * treated as embedded.  STR_SHARED is the same bit as ELTS_SHARED.
 * STR_SHARED_P / STR_ASSOC_P require STR_NOEMBED *and* the respective bit
 * (FL_ALL), so the bits only have these meanings on non-embedded strings.
 *
 * "NOCAPA" groups the states in which as.heap.aux is used for the `shared`
 * member rather than `capa`: STR_NOCAPA_P is true for a non-embedded string
 * with either ELTS_SHARED or STR_ASSOC set, and STR_UNSET_NOCAPA clears both
 * bits on a non-embedded string.
 */
54 #define RUBY_MAX_CHAR_LEN 16
55 #define STR_TMPLOCK FL_USER7
56 #define STR_NOEMBED FL_USER1
57 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */
58 #define STR_ASSOC FL_USER3
59 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
60 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
61 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
62 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
63 #define STR_UNSET_NOCAPA(s) do {\
64  if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
65 } while (0)
66 
67 
68 #define STR_SET_NOEMBED(str) do {\
69  FL_SET((str), STR_NOEMBED);\
70  STR_SET_EMBED_LEN((str), 0);\
71 } while (0)
72 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
73 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
74 #define STR_SET_EMBED_LEN(str, n) do { \
75  long tmp_n = (n);\
76  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
77  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
78 } while (0)
79 
80 #define STR_SET_LEN(str, n) do { \
81  if (STR_EMBED_P(str)) {\
82  STR_SET_EMBED_LEN((str), (n));\
83  }\
84  else {\
85  RSTRING(str)->as.heap.len = (n);\
86  }\
87 } while (0)
88 
89 #define STR_DEC_LEN(str) do {\
90  if (STR_EMBED_P(str)) {\
91  long n = RSTRING_LEN(str);\
92  n--;\
93  STR_SET_EMBED_LEN((str), n);\
94  }\
95  else {\
96  RSTRING(str)->as.heap.len--;\
97  }\
98 } while (0)
99 
/*
 * RESIZE_CAPA(str, capacity): give `str` a buffer of capacity+1 bytes.
 *
 * - Embedded string: nothing happens unless `capacity` exceeds
 *   RSTRING_EMBED_LEN_MAX.  In that case a new buffer is allocated, the
 *   current bytes are copied into it, and the string is switched to the
 *   non-embedded representation.  heap.len is assigned from RSTRING_LEN(str)
 *   *before* STR_SET_NOEMBED runs, because STR_SET_NOEMBED also resets the
 *   embedded length bits to 0.
 * - Non-embedded string: the existing buffer is REALLOC_N'ed; aux.capa is
 *   updated only when STR_NOCAPA_P(str) is false.
 *
 * NOTE: both `str` and `capacity` are expanded more than once, so callers
 * should pass expressions without side effects.
 */
100 #define RESIZE_CAPA(str,capacity) do {\
101  if (STR_EMBED_P(str)) {\
102  if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
103  char *tmp = ALLOC_N(char, (capacity)+1);\
104  memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
105  RSTRING(str)->as.heap.ptr = tmp;\
106  RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
107  STR_SET_NOEMBED(str);\
108  RSTRING(str)->as.heap.aux.capa = (capacity);\
109  }\
110  }\
111  else {\
112  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
113  if (!STR_NOCAPA_P(str))\
114  RSTRING(str)->as.heap.aux.capa = (capacity);\
115  }\
116 } while (0)
117 
118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
120 
121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
122 
123 static inline int
125 {
126  rb_encoding *enc;
127 
128  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
129  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
130  return 1;
131 
132  enc = STR_ENC_GET(str);
133  if (rb_enc_mbmaxlen(enc) == 1)
134  return 1;
135 
136  /* Conservative. Possibly single byte.
137  * "\xa1" in Shift_JIS for example. */
138  return 0;
139 }
140 
142 
/*
 * Scan the byte range [p, e) and return a pointer to the first byte for
 * which ISASCII() is false, or NULL when every byte passes.
 *
 * When SIZEOF_VALUE is 4 or 8, NONASCII_MASK is defined (and stays defined
 * for the rest of the file).  Ranges longer than two VALUE-sized words are
 * then scanned in three phases: single bytes up to the first VALUE-aligned
 * address, whole aligned words tested against NONASCII_MASK, and finally
 * single bytes from the first suspicious word (or the aligned end) to e.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if (e - p > (int)sizeof(VALUE) * 2) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first aligned address at or after p, last aligned address at or before e */
        const VALUE *word = (const VALUE *)(((VALUE)p + align_mask) & ~align_mask);
        const VALUE *word_end = (const VALUE *)((VALUE)e & ~align_mask);

        /* unaligned head, one byte at a time */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p)) {
                return p;
            }
        }
        /* aligned body: stop at the first word with a high bit set in any byte */
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK) {
                break;
            }
        }
        /* resume the byte scan where the word scan stopped */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p)) {
            return p;
        }
    }
    return NULL;
}
179 
180 static int
181 coderange_scan(const char *p, long len, rb_encoding *enc)
182 {
183  const char *e = p + len;
184 
185  if (rb_enc_to_index(enc) == 0) {
186  /* enc is ASCII-8BIT. An ASCII-8BIT string is never broken. */
187  p = search_nonascii(p, e);
189  }
190 
191  if (rb_enc_asciicompat(enc)) {
192  p = search_nonascii(p, e);
193  if (!p) {
194  return ENC_CODERANGE_7BIT;
195  }
196  while (p < e) {
197  int ret = rb_enc_precise_mbclen(p, e, enc);
198  if (!MBCLEN_CHARFOUND_P(ret)) {
199  return ENC_CODERANGE_BROKEN;
200  }
201  p += MBCLEN_CHARFOUND_LEN(ret);
202  if (p < e) {
203  p = search_nonascii(p, e);
204  if (!p) {
205  return ENC_CODERANGE_VALID;
206  }
207  }
208  }
209  if (e < p) {
210  return ENC_CODERANGE_BROKEN;
211  }
212  return ENC_CODERANGE_VALID;
213  }
214 
215  while (p < e) {
216  int ret = rb_enc_precise_mbclen(p, e, enc);
217 
218  if (!MBCLEN_CHARFOUND_P(ret)) {
219  return ENC_CODERANGE_BROKEN;
220  }
221  p += MBCLEN_CHARFOUND_LEN(ret);
222  }
223  if (e < p) {
224  return ENC_CODERANGE_BROKEN;
225  }
226  return ENC_CODERANGE_VALID;
227 }
228 
229 long
230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
231 {
232  const char *p = s;
233 
234  if (*cr == ENC_CODERANGE_BROKEN)
235  return e - s;
236 
237  if (rb_enc_to_index(enc) == 0) {
238  /* enc is ASCII-8BIT. An ASCII-8BIT string is never broken. */
239  p = search_nonascii(p, e);
241  return e - s;
242  }
243  else if (rb_enc_asciicompat(enc)) {
244  p = search_nonascii(p, e);
245  if (!p) {
246  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
247  return e - s;
248  }
249  while (p < e) {
250  int ret = rb_enc_precise_mbclen(p, e, enc);
251  if (!MBCLEN_CHARFOUND_P(ret)) {
253  return p - s;
254  }
255  p += MBCLEN_CHARFOUND_LEN(ret);
256  if (p < e) {
257  p = search_nonascii(p, e);
258  if (!p) {
259  *cr = ENC_CODERANGE_VALID;
260  return e - s;
261  }
262  }
263  }
265  return p - s;
266  }
267  else {
268  while (p < e) {
269  int ret = rb_enc_precise_mbclen(p, e, enc);
270  if (!MBCLEN_CHARFOUND_P(ret)) {
272  return p - s;
273  }
274  p += MBCLEN_CHARFOUND_LEN(ret);
275  }
277  return p - s;
278  }
279 }
280 
281 static inline void
283 {
284  rb_enc_set_index(str1, ENCODING_GET(str2));
285 }
286 
287 static void
289 {
290  /* this function is designed for copying encoding and coderange
291  * from src to new string "dest" which is made from the part of src.
292  */
293  str_enc_copy(dest, src);
294  switch (ENC_CODERANGE(src)) {
295  case ENC_CODERANGE_7BIT:
297  break;
298  case ENC_CODERANGE_VALID:
299  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
302  else
304  break;
305  default:
306  if (RSTRING_LEN(dest) == 0) {
307  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
309  else
311  }
312  break;
313  }
314 }
315 
316 static void
318 {
319  str_enc_copy(dest, src);
320  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
321 }
322 
323 int
325 {
326  int cr = ENC_CODERANGE(str);
327 
328  if (cr == ENC_CODERANGE_UNKNOWN) {
329  rb_encoding *enc = STR_ENC_GET(str);
330  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
331  ENC_CODERANGE_SET(str, cr);
332  }
333  return cr;
334 }
335 
336 int
338 {
339  rb_encoding *enc = STR_ENC_GET(str);
340 
341  if (!rb_enc_asciicompat(enc))
342  return FALSE;
343  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
344  return TRUE;
345  return FALSE;
346 }
347 
348 static inline void
349 str_mod_check(VALUE s, const char *p, long len)
350 {
351  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
352  rb_raise(rb_eRuntimeError, "string modified");
353  }
354 }
355 
356 size_t
358 {
359  if (STR_EMBED_P(str)) {
360  return RSTRING_EMBED_LEN_MAX;
361  }
362  else if (STR_NOCAPA_P(str)) {
363  return RSTRING(str)->as.heap.len;
364  }
365  else {
366  return RSTRING(str)->as.heap.aux.capa;
367  }
368 }
369 
370 static inline VALUE
372 {
373  NEWOBJ(str, struct RString);
374  OBJSETUP(str, klass, T_STRING);
375 
376  str->as.heap.ptr = 0;
377  str->as.heap.len = 0;
378  str->as.heap.aux.capa = 0;
379 
380  return (VALUE)str;
381 }
382 
383 static VALUE
384 str_new(VALUE klass, const char *ptr, long len)
385 {
386  VALUE str;
387 
388  if (len < 0) {
389  rb_raise(rb_eArgError, "negative string size (or size too big)");
390  }
391 
392  str = str_alloc(klass);
393  if (len > RSTRING_EMBED_LEN_MAX) {
394  RSTRING(str)->as.heap.aux.capa = len;
395  RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
396  STR_SET_NOEMBED(str);
397  }
398  else if (len == 0) {
400  }
401  if (ptr) {
402  memcpy(RSTRING_PTR(str), ptr, len);
403  }
404  STR_SET_LEN(str, len);
405  RSTRING_PTR(str)[len] = '\0';
406  return str;
407 }
408 
409 VALUE
410 rb_str_new(const char *ptr, long len)
411 {
412  return str_new(rb_cString, ptr, len);
413 }
414 
415 VALUE
416 rb_usascii_str_new(const char *ptr, long len)
417 {
418  VALUE str = rb_str_new(ptr, len);
420  return str;
421 }
422 
423 VALUE
424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
425 {
426  VALUE str = rb_str_new(ptr, len);
427  rb_enc_associate(str, enc);
428  return str;
429 }
430 
431 VALUE
432 rb_str_new_cstr(const char *ptr)
433 {
434  if (!ptr) {
435  rb_raise(rb_eArgError, "NULL pointer given");
436  }
437  return rb_str_new(ptr, strlen(ptr));
438 }
439 
441 #define rb_str_new2 rb_str_new_cstr
442 
443 VALUE
444 rb_usascii_str_new_cstr(const char *ptr)
445 {
446  VALUE str = rb_str_new2(ptr);
448  return str;
449 }
450 
452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
453 
454 VALUE
455 rb_tainted_str_new(const char *ptr, long len)
456 {
457  VALUE str = rb_str_new(ptr, len);
458 
459  OBJ_TAINT(str);
460  return str;
461 }
462 
463 VALUE
464 rb_tainted_str_new_cstr(const char *ptr)
465 {
466  VALUE str = rb_str_new2(ptr);
467 
468  OBJ_TAINT(str);
469  return str;
470 }
471 
473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
474 
475 VALUE
476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
477 {
478  rb_econv_t *ec;
479  rb_econv_result_t ret;
480  long len;
481  VALUE newstr;
482  const unsigned char *sp;
483  unsigned char *dp;
484 
485  if (!to) return str;
486  if (from == to) return str;
487  if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
488  to == rb_ascii8bit_encoding()) {
489  if (STR_ENC_GET(str) != to) {
490  str = rb_str_dup(str);
491  rb_enc_associate(str, to);
492  }
493  return str;
494  }
495 
496  len = RSTRING_LEN(str);
497  newstr = rb_str_new(0, len);
498 
499  retry:
500  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
501  if (!ec) return str;
502 
503  sp = (unsigned char*)RSTRING_PTR(str);
504  dp = (unsigned char*)RSTRING_PTR(newstr);
505  ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
506  &dp, (unsigned char*)RSTRING_END(newstr), 0);
507  rb_econv_close(ec);
508  switch (ret) {
510  /* destination buffer short */
511  len = len < 2 ? 2 : len * 2;
512  rb_str_resize(newstr, len);
513  goto retry;
514 
515  case econv_finished:
516  len = dp - (unsigned char*)RSTRING_PTR(newstr);
517  rb_str_set_len(newstr, len);
518  rb_enc_associate(newstr, to);
519  return newstr;
520 
521  default:
522  /* some error, return original */
523  return str;
524  }
525 }
526 
527 VALUE
529 {
530  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
531 }
532 
533 VALUE
534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
535 {
536  VALUE str;
537 
538  str = rb_tainted_str_new(ptr, len);
539  if (eenc == rb_usascii_encoding() &&
542  return str;
543  }
544  rb_enc_associate(str, eenc);
545  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
546 }
547 
548 VALUE
549 rb_external_str_new(const char *ptr, long len)
550 {
552 }
553 
554 VALUE
555 rb_external_str_new_cstr(const char *ptr)
556 {
558 }
559 
560 VALUE
561 rb_locale_str_new(const char *ptr, long len)
562 {
564 }
565 
566 VALUE
567 rb_locale_str_new_cstr(const char *ptr)
568 {
570 }
571 
572 VALUE
573 rb_filesystem_str_new(const char *ptr, long len)
574 {
576 }
577 
578 VALUE
580 {
582 }
583 
584 VALUE
586 {
588 }
589 
590 VALUE
592 {
593  return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
594 }
595 
596 VALUE
598 {
599  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
600 }
601 
602 static VALUE
604 {
605  if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
606  STR_SET_EMBED(str2);
607  memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
608  STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
609  }
610  else {
611  str = rb_str_new_frozen(str);
612  FL_SET(str2, STR_NOEMBED);
613  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
614  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
615  RSTRING(str2)->as.heap.aux.shared = str;
616  FL_SET(str2, ELTS_SHARED);
617  }
618  rb_enc_cr_str_exact_copy(str2, str);
619 
620  return str2;
621 }
622 
623 static VALUE
625 {
626  return str_replace_shared(str_alloc(klass), str);
627 }
628 
629 static VALUE
630 str_new3(VALUE klass, VALUE str)
631 {
632  return str_new_shared(klass, str);
633 }
634 
635 VALUE
637 {
638  VALUE str2 = str_new3(rb_obj_class(str), str);
639 
640  OBJ_INFECT(str2, str);
641  return str2;
642 }
643 
645 #define rb_str_new3 rb_str_new_shared
646 
647 static VALUE
648 str_new4(VALUE klass, VALUE str)
649 {
650  VALUE str2;
651 
652  str2 = str_alloc(klass);
653  STR_SET_NOEMBED(str2);
654  RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
655  RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
656  if (STR_SHARED_P(str)) {
657  VALUE shared = RSTRING(str)->as.heap.aux.shared;
658  assert(OBJ_FROZEN(shared));
659  FL_SET(str2, ELTS_SHARED);
660  RSTRING(str2)->as.heap.aux.shared = shared;
661  }
662  else {
663  FL_SET(str, ELTS_SHARED);
664  RSTRING(str)->as.heap.aux.shared = str2;
665  }
666  rb_enc_cr_str_exact_copy(str2, str);
667  OBJ_INFECT(str2, str);
668  return str2;
669 }
670 
671 VALUE
673 {
674  VALUE klass, str;
675 
676  if (OBJ_FROZEN(orig)) return orig;
677  klass = rb_obj_class(orig);
678  if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
679  long ofs;
680  assert(OBJ_FROZEN(str));
681  ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
682  if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
683  (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
684  ENCODING_GET(str) != ENCODING_GET(orig)) {
685  str = str_new3(klass, str);
686  RSTRING(str)->as.heap.ptr += ofs;
687  RSTRING(str)->as.heap.len -= ofs;
688  rb_enc_cr_str_exact_copy(str, orig);
689  OBJ_INFECT(str, orig);
690  }
691  }
692  else if (STR_EMBED_P(orig)) {
693  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
694  rb_enc_cr_str_exact_copy(str, orig);
695  OBJ_INFECT(str, orig);
696  }
697  else if (STR_ASSOC_P(orig)) {
698  VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
699  FL_UNSET(orig, STR_ASSOC);
700  str = str_new4(klass, orig);
701  FL_SET(str, STR_ASSOC);
702  RSTRING(str)->as.heap.aux.shared = assoc;
703  }
704  else {
705  str = str_new4(klass, orig);
706  }
707  OBJ_FREEZE(str);
708  return str;
709 }
710 
712 #define rb_str_new4 rb_str_new_frozen
713 
714 VALUE
715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
716 {
717  return str_new(rb_obj_class(obj), ptr, len);
718 }
719 
720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
721  rb_str_new_with_class, (obj, ptr, len))
722 #define rb_str_new5 rb_str_new_with_class
723 
724 static VALUE
725 str_new_empty(VALUE str)
726 {
727  VALUE v = rb_str_new5(str, 0, 0);
728  rb_enc_copy(v, str);
729  OBJ_INFECT(v, str);
730  return v;
731 }
732 
733 #define STR_BUF_MIN_SIZE 128
734 
735 VALUE
736 rb_str_buf_new(long capa)
737 {
738  VALUE str = str_alloc(rb_cString);
739 
740  if (capa < STR_BUF_MIN_SIZE) {
741  capa = STR_BUF_MIN_SIZE;
742  }
743  FL_SET(str, STR_NOEMBED);
744  RSTRING(str)->as.heap.aux.capa = capa;
745  RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
746  RSTRING(str)->as.heap.ptr[0] = '\0';
747 
748  return str;
749 }
750 
751 VALUE
752 rb_str_buf_new_cstr(const char *ptr)
753 {
754  VALUE str;
755  long len = strlen(ptr);
756 
757  str = rb_str_buf_new(len);
758  rb_str_buf_cat(str, ptr, len);
759 
760  return str;
761 }
762 
764 #define rb_str_buf_new2 rb_str_buf_new_cstr
765 
766 VALUE
767 rb_str_tmp_new(long len)
768 {
769  return str_new(0, 0, len);
770 }
771 
772 void *
773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
774 {
775  VALUE s = rb_str_tmp_new(len);
776  *store = s;
777  return RSTRING_PTR(s);
778 }
779 
780 void
781 rb_free_tmp_buffer(volatile VALUE *store)
782 {
783  VALUE s = *store;
784  *store = 0;
785  if (s) rb_str_clear(s);
786 }
787 
788 void
790 {
791  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
792  xfree(RSTRING(str)->as.heap.ptr);
793  }
794 }
795 
796 RUBY_FUNC_EXPORTED size_t
798 {
799  if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
800  return RSTRING(str)->as.heap.aux.capa;
801  }
802  else {
803  return 0;
804  }
805 }
806 
807 VALUE
809 {
810  return rb_convert_type(str, T_STRING, "String", "to_str");
811 }
812 
813 static inline void str_discard(VALUE str);
814 
815 void
817 {
818  rb_encoding *enc;
819  int cr;
820  if (str == str2) return;
821  enc = STR_ENC_GET(str2);
822  cr = ENC_CODERANGE(str2);
823  str_discard(str);
824  OBJ_INFECT(str, str2);
825  if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
826  STR_SET_EMBED(str);
827  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
828  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
829  rb_enc_associate(str, enc);
830  ENC_CODERANGE_SET(str, cr);
831  return;
832  }
833  STR_SET_NOEMBED(str);
834  STR_UNSET_NOCAPA(str);
835  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
836  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
837  if (STR_NOCAPA_P(str2)) {
838  FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
839  RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
840  }
841  else {
842  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
843  }
844  STR_SET_EMBED(str2); /* abandon str2 */
845  RSTRING_PTR(str2)[0] = 0;
846  STR_SET_EMBED_LEN(str2, 0);
847  rb_enc_associate(str, enc);
848  ENC_CODERANGE_SET(str, cr);
849 }
850 
851 static ID id_to_s;
852 
853 VALUE
855 {
856  VALUE str;
857 
858  if (TYPE(obj) == T_STRING) {
859  return obj;
860  }
861  str = rb_funcall(obj, id_to_s, 0);
862  if (TYPE(str) != T_STRING)
863  return rb_any_to_s(obj);
864  if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
865  return str;
866 }
867 
868 static VALUE
870 {
871  long len;
872 
873  len = RSTRING_LEN(str2);
874  if (STR_ASSOC_P(str2)) {
875  str2 = rb_str_new4(str2);
876  }
877  if (STR_SHARED_P(str2)) {
878  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
879  assert(OBJ_FROZEN(shared));
880  STR_SET_NOEMBED(str);
881  RSTRING(str)->as.heap.len = len;
882  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
883  FL_SET(str, ELTS_SHARED);
884  FL_UNSET(str, STR_ASSOC);
885  RSTRING(str)->as.heap.aux.shared = shared;
886  }
887  else {
888  str_replace_shared(str, str2);
889  }
890 
891  OBJ_INFECT(str, str2);
892  rb_enc_cr_str_exact_copy(str, str2);
893  return str;
894 }
895 
896 static VALUE
898 {
899  VALUE dup = str_alloc(klass);
900  str_replace(dup, str);
901  return dup;
902 }
903 
904 VALUE
906 {
907  return str_duplicate(rb_obj_class(str), str);
908 }
909 
910 VALUE
912 {
913  return str_replace(str_alloc(rb_cString), str);
914 }
915 
916 /*
917  * call-seq:
918  * String.new(str="") -> new_str
919  *
920  * Returns a new string object containing a copy of <i>str</i>.
921  */
922 
923 static VALUE
925 {
926  VALUE orig;
927 
928  if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
929  rb_str_replace(str, orig);
930  return str;
931 }
932 
933 static inline long
934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
935 {
936  long c;
937  const char *q;
938 
939  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
940  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
941  }
942  else if (rb_enc_asciicompat(enc)) {
943  c = 0;
944  if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
945  while (p < e) {
946  if (ISASCII(*p)) {
947  q = search_nonascii(p, e);
948  if (!q)
949  return c + (e - p);
950  c += q - p;
951  p = q;
952  }
953  p += rb_enc_fast_mbclen(p, e, enc);
954  c++;
955  }
956  }
957  else {
958  while (p < e) {
959  if (ISASCII(*p)) {
960  q = search_nonascii(p, e);
961  if (!q)
962  return c + (e - p);
963  c += q - p;
964  p = q;
965  }
966  p += rb_enc_mbclen(p, e, enc);
967  c++;
968  }
969  }
970  return c;
971  }
972 
973  for (c=0; p<e; c++) {
974  p += rb_enc_mbclen(p, e, enc);
975  }
976  return c;
977 }
978 
979 long
980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
981 {
982  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
983 }
984 
985 long
986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
987 {
988  long c;
989  const char *q;
990  int ret;
991 
992  *cr = 0;
993  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
994  return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
995  }
996  else if (rb_enc_asciicompat(enc)) {
997  c = 0;
998  while (p < e) {
999  if (ISASCII(*p)) {
1000  q = search_nonascii(p, e);
1001  if (!q) {
1002  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1003  return c + (e - p);
1004  }
1005  c += q - p;
1006  p = q;
1007  }
1008  ret = rb_enc_precise_mbclen(p, e, enc);
1009  if (MBCLEN_CHARFOUND_P(ret)) {
1010  *cr |= ENC_CODERANGE_VALID;
1011  p += MBCLEN_CHARFOUND_LEN(ret);
1012  }
1013  else {
1014  *cr = ENC_CODERANGE_BROKEN;
1015  p++;
1016  }
1017  c++;
1018  }
1019  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1020  return c;
1021  }
1022 
1023  for (c=0; p<e; c++) {
1024  ret = rb_enc_precise_mbclen(p, e, enc);
1025  if (MBCLEN_CHARFOUND_P(ret)) {
1026  *cr |= ENC_CODERANGE_VALID;
1027  p += MBCLEN_CHARFOUND_LEN(ret);
1028  }
1029  else {
1030  *cr = ENC_CODERANGE_BROKEN;
1031  if (p + rb_enc_mbminlen(enc) <= e)
1032  p += rb_enc_mbminlen(enc);
1033  else
1034  p = e;
1035  }
1036  }
1037  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1038  return c;
1039 }
1040 
1041 #ifdef NONASCII_MASK
1042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1043 
1044 /*
1045  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1046  * bit represention. (see http://en.wikipedia.org/wiki/UTF-8)
1047  * Therefore, following pseudo code can detect UTF-8 leading byte.
1048  *
1049  * if (!(byte & 0x80))
1050  * byte |= 0x40; // turn on bit6
1051  * return ((byte>>6) & 1); // bit6 represent it's leading byte or not.
1052  *
1053  * This function calculate every bytes in the argument word `s'
1054  * using the above logic concurrently. and gather every bytes result.
1055  */
1056 static inline VALUE
1057 count_utf8_lead_bytes_with_word(const VALUE *s)
1058 {
1059  VALUE d = *s;
1060 
1061  /* Transform into bit0 represent UTF-8 leading or not. */
1062  d |= ~(d>>1);
1063  d >>= 6;
1064  d &= NONASCII_MASK >> 7;
1065 
1066  /* Gather every bytes. */
1067  d += (d>>8);
1068  d += (d>>16);
1069 #if SIZEOF_VALUE == 8
1070  d += (d>>32);
1071 #endif
1072  return (d&0xF);
1073 }
1074 #endif
1075 
1076 static long
1078 {
1079  const char *p, *e;
1080  long n;
1081  int cr;
1082 
1083  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1084  if (!enc) enc = STR_ENC_GET(str);
1085  p = RSTRING_PTR(str);
1086  e = RSTRING_END(str);
1087  cr = ENC_CODERANGE(str);
1088 #ifdef NONASCII_MASK
1089  if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1090  enc == rb_utf8_encoding()) {
1091 
1092  VALUE len = 0;
1093  if ((int)sizeof(VALUE) * 2 < e - p) {
1094  const VALUE *s, *t;
1095  const VALUE lowbits = sizeof(VALUE) - 1;
1096  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1097  t = (const VALUE*)(~lowbits & (VALUE)e);
1098  while (p < (const char *)s) {
1099  if (is_utf8_lead_byte(*p)) len++;
1100  p++;
1101  }
1102  while (s < t) {
1103  len += count_utf8_lead_bytes_with_word(s);
1104  s++;
1105  }
1106  p = (const char *)s;
1107  }
1108  while (p < e) {
1109  if (is_utf8_lead_byte(*p)) len++;
1110  p++;
1111  }
1112  return (long)len;
1113  }
1114 #endif
1115  n = rb_enc_strlen_cr(p, e, enc, &cr);
1116  if (cr) {
1117  ENC_CODERANGE_SET(str, cr);
1118  }
1119  return n;
1120 }
1121 
1122 long
1124 {
1125  return str_strlen(str, STR_ENC_GET(str));
1126 }
1127 
1128 /*
1129  * call-seq:
1130  * str.length -> integer
1131  * str.size -> integer
1132  *
1133  * Returns the character length of <i>str</i>.
1134  */
1135 
1136 VALUE
1138 {
1139  long len;
1140 
1141  len = str_strlen(str, STR_ENC_GET(str));
1142  return LONG2NUM(len);
1143 }
1144 
1145 /*
1146  * call-seq:
1147  * str.bytesize -> integer
1148  *
1149  * Returns the length of <i>str</i> in bytes.
1150  */
1151 
1152 static VALUE
1154 {
1155  return LONG2NUM(RSTRING_LEN(str));
1156 }
1157 
1158 /*
1159  * call-seq:
1160  * str.empty? -> true or false
1161  *
1162  * Returns <code>true</code> if <i>str</i> has a length of zero.
1163  *
1164  * "hello".empty? #=> false
1165  * "".empty? #=> true
1166  */
1167 
1168 static VALUE
1170 {
1171  if (RSTRING_LEN(str) == 0)
1172  return Qtrue;
1173  return Qfalse;
1174 }
1175 
1176 /*
1177  * call-seq:
1178  * str + other_str -> new_str
1179  *
1180  * Concatenation---Returns a new <code>String</code> containing
1181  * <i>other_str</i> concatenated to <i>str</i>.
1182  *
1183  * "Hello from " + self.to_s #=> "Hello from main"
1184  */
1185 
1186 VALUE
1188 {
1189  VALUE str3;
1190  rb_encoding *enc;
1191 
1192  StringValue(str2);
1193  enc = rb_enc_check(str1, str2);
1194  str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
1195  memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
1196  memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
1197  RSTRING_PTR(str2), RSTRING_LEN(str2));
1198  RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
1199 
1200  if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
1201  OBJ_TAINT(str3);
1204  return str3;
1205 }
1206 
1207 /*
1208  * call-seq:
1209  * str * integer -> new_str
1210  *
1211  * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
1212  * the receiver.
1213  *
1214  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1215  */
1216 
1217 VALUE
1219 {
1220  VALUE str2;
1221  long n, len;
1222  char *ptr2;
1223 
1224  len = NUM2LONG(times);
1225  if (len < 0) {
1226  rb_raise(rb_eArgError, "negative argument");
1227  }
1228  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1229  rb_raise(rb_eArgError, "argument too big");
1230  }
1231 
1232  str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
1233  ptr2 = RSTRING_PTR(str2);
1234  if (len) {
1235  n = RSTRING_LEN(str);
1236  memcpy(ptr2, RSTRING_PTR(str), n);
1237  while (n <= len/2) {
1238  memcpy(ptr2 + n, ptr2, n);
1239  n *= 2;
1240  }
1241  memcpy(ptr2 + n, ptr2, len-n);
1242  }
1243  ptr2[RSTRING_LEN(str2)] = '\0';
1244  OBJ_INFECT(str2, str);
1245  rb_enc_cr_str_copy_for_substr(str2, str);
1246 
1247  return str2;
1248 }
1249 
1250 /*
1251  * call-seq:
1252  * str % arg -> new_str
1253  *
1254  * Format---Uses <i>str</i> as a format specification, and returns the result
1255  * of applying it to <i>arg</i>. If the format specification contains more than
1256  * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1257  * containing the values to be substituted. See <code>Kernel::sprintf</code> for
1258  * details of the format string.
1259  *
1260  * "%05d" % 123 #=> "00123"
1261  * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
1262  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
1263  */
1264 
1265 static VALUE
1267 {
1268  volatile VALUE tmp = rb_check_array_type(arg);
1269 
1270  if (!NIL_P(tmp)) {
1271  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
1272  }
1273  return rb_str_format(1, &arg, str);
1274 }
1275 
1276 static inline void
1278 {
1279  if (FL_TEST(str, STR_TMPLOCK)) {
1280  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1281  }
1282  rb_check_frozen(str);
1283  if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1284  rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1285 }
1286 
1287 static inline int
1289 {
1290  str_modifiable(str);
1291  if (!STR_SHARED_P(str)) return 1;
1292  if (STR_EMBED_P(str)) return 1;
1293  return 0;
1294 }
1295 
1296 static void
1298 {
1299  char *ptr;
1300  long len = RSTRING_LEN(str);
1301  long capa = len + expand;
1302 
1303  if (len > capa) len = capa;
1304  ptr = ALLOC_N(char, capa + 1);
1305  if (RSTRING_PTR(str)) {
1306  memcpy(ptr, RSTRING_PTR(str), len);
1307  }
1308  STR_SET_NOEMBED(str);
1309  STR_UNSET_NOCAPA(str);
1310  ptr[len] = 0;
1311  RSTRING(str)->as.heap.ptr = ptr;
1312  RSTRING(str)->as.heap.len = len;
1313  RSTRING(str)->as.heap.aux.capa = capa;
1314 }
1315 
1316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
1317 
1318 void
1320 {
1321  if (!str_independent(str))
1322  str_make_independent(str);
1323  ENC_CODERANGE_CLEAR(str);
1324 }
1325 
1326 void
1327 rb_str_modify_expand(VALUE str, long expand)
1328 {
1329  if (expand < 0) {
1330  rb_raise(rb_eArgError, "negative expanding string size");
1331  }
1332  if (!str_independent(str)) {
1333  str_make_independent_expand(str, expand);
1334  }
1335  else if (expand > 0) {
1336  long len = RSTRING_LEN(str);
1337  long capa = len + expand;
1338  if (!STR_EMBED_P(str)) {
1339  REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
1340  RSTRING(str)->as.heap.aux.capa = capa;
1341  }
1342  else if (capa > RSTRING_EMBED_LEN_MAX) {
1343  str_make_independent_expand(str, expand);
1344  }
1345  }
1346  ENC_CODERANGE_CLEAR(str);
1347 }
1348 
1349 /* As rb_str_modify(), but don't clear coderange */
1350 static void
1352 {
1353  if (!str_independent(str))
1354  str_make_independent(str);
1355  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
1356  /* Force re-scan later */
1357  ENC_CODERANGE_CLEAR(str);
1358 }
1359 
1360 static inline void
1362 {
1363  str_modifiable(str);
1364  if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
1365  xfree(RSTRING_PTR(str));
1366  RSTRING(str)->as.heap.ptr = 0;
1367  RSTRING(str)->as.heap.len = 0;
1368  }
1369 }
1370 
1371 void
1373 {
1374  /* sanity check */
1375  rb_check_frozen(str);
1376  if (STR_ASSOC_P(str)) {
1377  /* already associated */
1378  rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1379  }
1380  else {
1381  if (STR_SHARED_P(str)) {
1382  VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1383  str_make_independent(str);
1384  if (STR_ASSOC_P(assoc)) {
1385  assoc = RSTRING(assoc)->as.heap.aux.shared;
1386  rb_ary_concat(assoc, add);
1387  add = assoc;
1388  }
1389  }
1390  else if (STR_EMBED_P(str)) {
1391  str_make_independent(str);
1392  }
1393  else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1394  RESIZE_CAPA(str, RSTRING_LEN(str));
1395  }
1396  FL_SET(str, STR_ASSOC);
1397  RBASIC(add)->klass = 0;
1398  RSTRING(str)->as.heap.aux.shared = add;
1399  }
1400 }
1401 
1402 VALUE
1404 {
1405  if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1406  if (STR_ASSOC_P(str)) {
1407  return RSTRING(str)->as.heap.aux.shared;
1408  }
1409  return Qfalse;
1410 }
1411 
1412 VALUE
1413 rb_string_value(volatile VALUE *ptr)
1414 {
1415  VALUE s = *ptr;
1416  if (TYPE(s) != T_STRING) {
1417  s = rb_str_to_str(s);
1418  *ptr = s;
1419  }
1420  return s;
1421 }
1422 
1423 char *
1425 {
1426  VALUE str = rb_string_value(ptr);
1427  return RSTRING_PTR(str);
1428 }
1429 
1430 char *
1432 {
1433  VALUE str = rb_string_value(ptr);
1434  char *s = RSTRING_PTR(str);
1435  long len = RSTRING_LEN(str);
1436 
1437  if (!s || memchr(s, 0, len)) {
1438  rb_raise(rb_eArgError, "string contains null byte");
1439  }
1440  if (s[len]) {
1441  rb_str_modify(str);
1442  s = RSTRING_PTR(str);
1443  s[RSTRING_LEN(str)] = 0;
1444  }
1445  return s;
1446 }
1447 
1448 VALUE
1450 {
1451  str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1452  return str;
1453 }
1454 
1455 /*
1456  * call-seq:
1457  * String.try_convert(obj) -> string or nil
1458  *
1459  * Try to convert <i>obj</i> into a String, using to_str method.
1460  * Returns converted string or nil if <i>obj</i> cannot be converted
1461  * for any reason.
1462  *
1463  * String.try_convert("str") #=> "str"
1464  * String.try_convert(/re/) #=> nil
1465  */
1466 static VALUE
1468 {
1469  return rb_check_string_type(str);
1470 }
1471 
1472 static char*
1473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
1474 {
1475  long nth = *nthp;
1476  if (rb_enc_mbmaxlen(enc) == 1) {
1477  p += nth;
1478  }
1479  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1480  p += nth * rb_enc_mbmaxlen(enc);
1481  }
1482  else if (rb_enc_asciicompat(enc)) {
1483  const char *p2, *e2;
1484  int n;
1485 
1486  while (p < e && 0 < nth) {
1487  e2 = p + nth;
1488  if (e < e2) {
1489  *nthp = nth;
1490  return (char *)e;
1491  }
1492  if (ISASCII(*p)) {
1493  p2 = search_nonascii(p, e2);
1494  if (!p2) {
1495  *nthp = nth;
1496  return (char *)e2;
1497  }
1498  nth -= p2 - p;
1499  p = p2;
1500  }
1501  n = rb_enc_mbclen(p, e, enc);
1502  p += n;
1503  nth--;
1504  }
1505  *nthp = nth;
1506  if (nth != 0) {
1507  return (char *)e;
1508  }
1509  return (char *)p;
1510  }
1511  else {
1512  while (p < e && nth--) {
1513  p += rb_enc_mbclen(p, e, enc);
1514  }
1515  }
1516  if (p > e) p = e;
1517  *nthp = nth;
1518  return (char*)p;
1519 }
1520 
1521 char*
1522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
1523 {
1524  return str_nth_len(p, e, &nth, enc);
1525 }
1526 
1527 static char*
1528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1529 {
1530  if (singlebyte)
1531  p += nth;
1532  else {
1533  p = str_nth_len(p, e, &nth, enc);
1534  }
1535  if (!p) return 0;
1536  if (p > e) p = e;
1537  return (char *)p;
1538 }
1539 
1540 /* char offset to byte offset */
1541 static long
1542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
1543 {
1544  const char *pp = str_nth(p, e, nth, enc, singlebyte);
1545  if (!pp) return e - p;
1546  return pp - p;
1547 }
1548 
1549 long
1550 rb_str_offset(VALUE str, long pos)
1551 {
1552  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
1554 }
1555 
1556 #ifdef NONASCII_MASK
/*
 * Find the byte position of the character *nthp characters after p in the
 * UTF-8 byte range [p, e), by counting lead bytes.
 *
 * On return *nthp holds the number of characters that were still wanted
 * when the scan stopped (0 when the target character was reached).
 * Returns the position where the scan stopped, which is e if the range
 * ran out first.
 */
1557 static char *
1558 str_utf8_nth(const char *p, const char *e, long *nthp)
1559 {
1560  long nth = *nthp;
/* Word-at-a-time fast path: taken only when both the byte range and the
 * requested character count exceed two VALUE-sized words. */
1561  if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
1562  const VALUE *s, *t;
1563  const VALUE lowbits = sizeof(VALUE) - 1;
/* s: p rounded up to a VALUE-aligned address; t: e rounded down to one */
1564  s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1565  t = (const VALUE*)(~lowbits & (VALUE)e);
/* count lead bytes one at a time up to the first aligned address */
1566  while (p < (const char *)s) {
1567  if (is_utf8_lead_byte(*p)) nth--;
1568  p++;
1569  }
/* Then subtract the lead-byte count of one whole word per iteration.
 * The loop only continues while at least sizeof(VALUE) characters are
 * still wanted; presumably this keeps nth from going negative, since a
 * word cannot hold more lead bytes than it has bytes. */
1570  do {
1571  nth -= count_utf8_lead_bytes_with_word(s);
1572  s++;
1573  } while (s < t && (int)sizeof(VALUE) <= nth);
1574  p = (char *)s;
1575  }
/* Finish byte by byte: stop on the lead byte seen once nth has reached 0. */
1576  while (p < e) {
1577  if (is_utf8_lead_byte(*p)) {
1578  if (nth == 0) break;
1579  nth--;
1580  }
1581  p++;
1582  }
1583  *nthp = nth;
1584  return (char *)p;
1585 }
1586 
1587 static long
1588 str_utf8_offset(const char *p, const char *e, long nth)
1589 {
1590  const char *pp = str_utf8_nth(p, e, &nth);
1591  return pp - p;
1592 }
1593 #endif
1594 
1595 /* byte offset to char offset */
1596 long
1597 rb_str_sublen(VALUE str, long pos)
1598 {
1599  if (single_byte_optimizable(str) || pos < 0)
1600  return pos;
1601  else {
1602  char *p = RSTRING_PTR(str);
1603  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
1604  }
1605 }
1606 
1607 VALUE
1608 rb_str_subseq(VALUE str, long beg, long len)
1609 {
1610  VALUE str2;
1611 
1612  if (RSTRING_LEN(str) == beg + len &&
1613  RSTRING_EMBED_LEN_MAX < len) {
1614  str2 = rb_str_new_shared(rb_str_new_frozen(str));
1615  rb_str_drop_bytes(str2, beg);
1616  }
1617  else {
1618  str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1619  }
1620 
1621  rb_enc_cr_str_copy_for_substr(str2, str);
1622  OBJ_INFECT(str2, str);
1623 
1624  return str2;
1625 }
1626 
/*
 * Return the substring of str that starts at character index beg and spans
 * up to len characters, as a new String.  A negative beg counts from the
 * end of the string.  Returns Qnil when len is negative or beg falls
 * outside the string.
 */
1627 VALUE
1628 rb_str_substr(VALUE str, long beg, long len)
1629 {
1630  rb_encoding *enc = STR_ENC_GET(str);
1631  VALUE str2;
1632  char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1633 
1634  if (len < 0) return Qnil;
1635  if (!RSTRING_LEN(str)) {
1636  len = 0;
1637  }
/* Single-byte case: character indexes are byte offsets, so plain
 * arithmetic on beg/len is enough. */
1638  if (single_byte_optimizable(str)) {
1639  if (beg > RSTRING_LEN(str)) return Qnil;
1640  if (beg < 0) {
1641  beg += RSTRING_LEN(str);
1642  if (beg < 0) return Qnil;
1643  }
/* clip the requested length to what is available after beg */
1644  if (beg + len > RSTRING_LEN(str))
1645  len = RSTRING_LEN(str) - beg;
1646  if (len <= 0) {
1647  len = 0;
1648  p = 0;
1649  }
1650  else
1651  p = s + beg;
1652  goto sub;
1653  }
1654  if (beg < 0) {
/* a range starting -beg characters from the end cannot be longer than -beg */
1655  if (len > -beg) len = -beg;
/* When the start is close to the end (-beg * mbmaxlen is under one eighth
 * of the byte length), walk backwards from the end instead of counting
 * the characters of the whole string. */
1656  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1657  beg = -beg;
/* step e back (beg - len) characters so that it marks the end of the result */
1658  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
1659  p = e;
1660  if (!p) return Qnil;
/* then step p back len characters from there to find the start */
1661  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
1662  if (!p) return Qnil;
1663  len = e - p;
1664  goto sub;
1665  }
1666  else {
1667  beg += str_strlen(str, enc);
1668  if (beg < 0) return Qnil;
1669  }
1670  }
/* a character index larger than the byte length is certainly out of range */
1671  else if (beg > 0 && beg > RSTRING_LEN(str)) {
1672  return Qnil;
1673  }
/* From here on len is converted from a character count to a byte count
 * and p is set to the first byte of the result. */
1674  if (len == 0) {
1675  if (beg > str_strlen(str, enc)) return Qnil;
1676  p = 0;
1677  }
1678 #ifdef NONASCII_MASK
/* valid UTF-8: use the lead-byte counting helpers */
1679  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1680  enc == rb_utf8_encoding()) {
1681  p = str_utf8_nth(s, e, &beg);
1682  if (beg > 0) return Qnil;
1683  len = str_utf8_offset(p, e, len);
1684  }
1685 #endif
/* fixed-width encoding: positions are multiples of the character size */
1686  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1687  int char_sz = rb_enc_mbmaxlen(enc);
1688 
1689  p = s + beg * char_sz;
1690  if (p > e) {
1691  return Qnil;
1692  }
1693  else if (len * char_sz > e - p)
1694  len = e - p;
1695  else
1696  len *= char_sz;
1697  }
/* general case: str_nth_len() leaves in beg the characters it could not skip */
1698  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
1699  if (beg > 0) return Qnil;
1700  len = 0;
1701  }
1702  else {
1703  len = str_offset(p, e, len, enc, 0);
1704  }
1705  sub:
/* A result too long to embed whose beg + len equals the byte length is
 * built on the original buffer via rb_str_new4()/str_new3(), with the
 * pointer moved to the last len bytes; anything else is copied.
 * NOTE(review): on several paths above beg is not a byte offset when
 * control reaches this label (e.g. it is a character index in the
 * fixed-width branch) -- confirm the beg + len test is what is intended. */
1706  if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1707  str2 = rb_str_new4(str);
1708  str2 = str_new3(rb_obj_class(str2), str2);
1709  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1710  RSTRING(str2)->as.heap.len = len;
1711  }
1712  else {
1713  str2 = rb_str_new5(str, p, len);
1714  rb_enc_cr_str_copy_for_substr(str2, str);
1715  OBJ_INFECT(str2, str);
1716  }
1717 
1718  return str2;
1719 }
1720 
1721 VALUE
1723 {
1724  if (STR_ASSOC_P(str)) {
1725  VALUE ary = RSTRING(str)->as.heap.aux.shared;
1726  OBJ_FREEZE(ary);
1727  }
1728  return rb_obj_freeze(str);
1729 }
1730 
1732 #define rb_str_dup_frozen rb_str_new_frozen
1733 
1734 VALUE
1735 rb_str_locktmp(VALUE str)
1736 {
1737  if (FL_TEST(str, STR_TMPLOCK)) {
1738  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1739  }
1740  FL_SET(str, STR_TMPLOCK);
1741  return str;
1742 }
1743 
1744 VALUE
1746 {
1747  if (!FL_TEST(str, STR_TMPLOCK)) {
1748  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1749  }
1750  FL_UNSET(str, STR_TMPLOCK);
1751  return str;
1752 }
1753 
1754 void
1755 rb_str_set_len(VALUE str, long len)
1756 {
1757  long capa;
1758 
1759  str_modifiable(str);
1760  if (STR_SHARED_P(str)) {
1761  rb_raise(rb_eRuntimeError, "can't set length of shared string");
1762  }
1763  if (len > (capa = (long)rb_str_capacity(str))) {
1764  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
1765  }
1766  STR_SET_LEN(str, len);
1767  RSTRING_PTR(str)[len] = '\0';
1768 }
1769 
1770 VALUE
1771 rb_str_resize(VALUE str, long len)
1772 {
1773  long slen;
1774  int independent;
1775 
1776  if (len < 0) {
1777  rb_raise(rb_eArgError, "negative string size (or size too big)");
1778  }
1779 
1780  independent = str_independent(str);
1781  ENC_CODERANGE_CLEAR(str);
1782  slen = RSTRING_LEN(str);
1783  if (len != slen) {
1784  if (STR_EMBED_P(str)) {
1785  if (len <= RSTRING_EMBED_LEN_MAX) {
1786  STR_SET_EMBED_LEN(str, len);
1787  RSTRING(str)->as.ary[len] = '\0';
1788  return str;
1789  }
1790  str_make_independent_expand(str, len - slen);
1791  STR_SET_NOEMBED(str);
1792  }
1793  else if (len <= RSTRING_EMBED_LEN_MAX) {
1794  char *ptr = RSTRING(str)->as.heap.ptr;
1795  STR_SET_EMBED(str);
1796  if (slen > len) slen = len;
1797  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
1798  RSTRING(str)->as.ary[len] = '\0';
1799  STR_SET_EMBED_LEN(str, len);
1800  if (independent) xfree(ptr);
1801  return str;
1802  }
1803  else if (!independent) {
1804  str_make_independent_expand(str, len - slen);
1805  }
1806  else if (slen < len || slen - len > 1024) {
1807  REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1808  }
1809  if (!STR_NOCAPA_P(str)) {
1810  RSTRING(str)->as.heap.aux.capa = len;
1811  }
1812  RSTRING(str)->as.heap.len = len;
1813  RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */
1814  }
1815  return str;
1816 }
1817 
1818 static VALUE
1819 str_buf_cat(VALUE str, const char *ptr, long len)
1820 {
1821  long capa, total, off = -1;
1822 
1823  if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1824  off = ptr - RSTRING_PTR(str);
1825  }
1826  rb_str_modify(str);
1827  if (len == 0) return 0;
1828  if (STR_ASSOC_P(str)) {
1829  FL_UNSET(str, STR_ASSOC);
1830  capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1831  }
1832  else if (STR_EMBED_P(str)) {
1833  capa = RSTRING_EMBED_LEN_MAX;
1834  }
1835  else {
1836  capa = RSTRING(str)->as.heap.aux.capa;
1837  }
1838  if (RSTRING_LEN(str) >= LONG_MAX - len) {
1839  rb_raise(rb_eArgError, "string sizes too big");
1840  }
1841  total = RSTRING_LEN(str)+len;
1842  if (capa <= total) {
1843  while (total > capa) {
1844  if (capa + 1 >= LONG_MAX / 2) {
1845  capa = (total + 4095) / 4096;
1846  break;
1847  }
1848  capa = (capa + 1) * 2;
1849  }
1850  RESIZE_CAPA(str, capa);
1851  }
1852  if (off != -1) {
1853  ptr = RSTRING_PTR(str) + off;
1854  }
1855  memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1856  STR_SET_LEN(str, total);
1857  RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1858 
1859  return str;
1860 }
1861 
1862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
1863 
1864 VALUE
1865 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1866 {
1867  if (len == 0) return str;
1868  if (len < 0) {
1869  rb_raise(rb_eArgError, "negative string size (or size too big)");
1870  }
1871  return str_buf_cat(str, ptr, len);
1872 }
1873 
1874 VALUE
1875 rb_str_buf_cat2(VALUE str, const char *ptr)
1876 {
1877  return rb_str_buf_cat(str, ptr, strlen(ptr));
1878 }
1879 
1880 VALUE
1881 rb_str_cat(VALUE str, const char *ptr, long len)
1882 {
1883  if (len < 0) {
1884  rb_raise(rb_eArgError, "negative string size (or size too big)");
1885  }
1886  if (STR_ASSOC_P(str)) {
1887  char *p;
1888  rb_str_modify_expand(str, len);
1889  p = RSTRING(str)->as.heap.ptr;
1890  memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
1891  len = RSTRING(str)->as.heap.len += len;
1892  p[len] = '\0'; /* sentinel */
1893  return str;
1894  }
1895 
1896  return rb_str_buf_cat(str, ptr, len);
1897 }
1898 
1899 VALUE
1900 rb_str_cat2(VALUE str, const char *ptr)
1901 {
1902  return rb_str_cat(str, ptr, strlen(ptr));
1903 }
1904 
/*
 * Append len bytes at ptr to str, taking encodings and coderanges into
 * account, and return str.
 *
 *   ptr_encindex  encoding index of the bytes at ptr
 *   ptr_cr        coderange of those bytes; ENC_CODERANGE_UNKNOWN makes
 *                 this function scan them where it needs the answer
 *   ptr_cr_ret    when non-null, receives the coderange finally used for
 *                 ptr
 *
 * After the append, str's encoding index and coderange are set to the
 * combination computed below.  Raises rb_eEncCompatError when the two
 * encodings cannot be combined and ArgumentError when len is negative.
 */
1905 static VALUE
1906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1907  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1908 {
1909  int str_encindex = ENCODING_GET(str);
1910  int res_encindex;
1911  int str_cr, res_cr;
1912 
1913  str_cr = ENC_CODERANGE(str);
1914 
1915  if (str_encindex == ptr_encindex) {
/* Same encoding: if str's coderange is unknown the result will be unknown
 * too, so scanning ptr is skipped; otherwise ptr's coderange is needed. */
1916  if (str_cr == ENC_CODERANGE_UNKNOWN)
1917  ptr_cr = ENC_CODERANGE_UNKNOWN;
1918  else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1919  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1920  }
1921  }
1922  else {
1923  rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1924  rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
/* Different encodings where at least one is not ASCII-compatible: only an
 * empty side can be combined.  Appending nothing is a no-op, and an empty
 * str simply takes over ptr's bytes, encoding and coderange. */
1925  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
1926  if (len == 0)
1927  return str;
1928  if (RSTRING_LEN(str) == 0) {
1929  rb_str_buf_cat(str, ptr, len);
1930  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1931  return str;
1932  }
1933  goto incompatible;
1934  }
/* Both ASCII-compatible: the coderanges decide whether they can be mixed. */
1935  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1936  ptr_cr = coderange_scan(ptr, len, ptr_enc);
1937  }
1938  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1939  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
1940  str_cr = rb_enc_str_coderange(str);
1941  }
1942  }
1943  }
1944  if (ptr_cr_ret)
1945  *ptr_cr_ret = ptr_cr;
1946 
/* Different encodings are accepted only when at least one side is 7bit. */
1947  if (str_encindex != ptr_encindex &&
1948  str_cr != ENC_CODERANGE_7BIT &&
1949  ptr_cr != ENC_CODERANGE_7BIT) {
1950  incompatible:
1951  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1952  rb_enc_name(rb_enc_from_index(str_encindex)),
1953  rb_enc_name(rb_enc_from_index(ptr_encindex)));
1954  }
1955 
/* Choose the encoding index and coderange of the result from str_cr. */
1956  if (str_cr == ENC_CODERANGE_UNKNOWN) {
1957  res_encindex = str_encindex;
1958  res_cr = ENC_CODERANGE_UNKNOWN;
1959  }
1960  else if (str_cr == ENC_CODERANGE_7BIT) {
1961  if (ptr_cr == ENC_CODERANGE_7BIT) {
1962  res_encindex = str_encindex;
1963  res_cr = ENC_CODERANGE_7BIT;
1964  }
1965  else {
/* str is 7bit but ptr is not: the result follows ptr */
1966  res_encindex = ptr_encindex;
1967  res_cr = ptr_cr;
1968  }
1969  }
1970  else if (str_cr == ENC_CODERANGE_VALID) {
1971  res_encindex = str_encindex;
1972  if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
1973  res_cr = str_cr;
1974  else
1975  res_cr = ptr_cr;
1976  }
1977  else { /* str_cr == ENC_CODERANGE_BROKEN */
1978  res_encindex = str_encindex;
1979  res_cr = str_cr;
/* appending bytes to a broken string resets the coderange to unknown */
1980  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1981  }
1982 
/* NOTE(review): this check runs after the coderange_scan(ptr, len, ...)
 * calls above may already have been made with the same len -- confirm
 * that a negative len cannot reach them. */
1983  if (len < 0) {
1984  rb_raise(rb_eArgError, "negative string size (or size too big)");
1985  }
1986  str_buf_cat(str, ptr, len);
1987  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1988  return str;
1989 }
1990 
1991 VALUE
1992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1993 {
1994  return rb_enc_cr_str_buf_cat(str, ptr, len,
1996 }
1997 
1998 VALUE
1999 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2000 {
2001  /* ptr must reference NUL terminated ASCII string. */
2002  int encindex = ENCODING_GET(str);
2003  rb_encoding *enc = rb_enc_from_index(encindex);
2004  if (rb_enc_asciicompat(enc)) {
2005  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2006  encindex, ENC_CODERANGE_7BIT, 0);
2007  }
2008  else {
2009  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2010  while (*ptr) {
2011  unsigned int c = (unsigned char)*ptr;
2012  int len = rb_enc_codelen(c, enc);
2013  rb_enc_mbcput(c, buf, enc);
2014  rb_enc_cr_str_buf_cat(str, buf, len,
2015  encindex, ENC_CODERANGE_VALID, 0);
2016  ptr++;
2017  }
2018  return str;
2019  }
2020 }
2021 
2022 VALUE
2024 {
2025  int str2_cr;
2026 
2027  str2_cr = ENC_CODERANGE(str2);
2028 
2029  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2030  ENCODING_GET(str2), str2_cr, &str2_cr);
2031 
2032  OBJ_INFECT(str, str2);
2033  ENC_CODERANGE_SET(str2, str2_cr);
2034 
2035  return str;
2036 }
2037 
2038 VALUE
2040 {
2041  rb_encoding *enc;
2042  int cr, cr2;
2043  long len2;
2044 
2045  StringValue(str2);
2046  if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
2047  long len = RSTRING_LEN(str) + len2;
2048  enc = rb_enc_check(str, str2);
2049  cr = ENC_CODERANGE(str);
2050  if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
2051  rb_str_modify_expand(str, len2);
2052  memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
2053  RSTRING_PTR(str2), len2+1);
2054  RSTRING(str)->as.heap.len = len;
2055  rb_enc_associate(str, enc);
2056  ENC_CODERANGE_SET(str, cr);
2057  OBJ_INFECT(str, str2);
2058  return str;
2059  }
2060  return rb_str_buf_append(str, str2);
2061 }
2062 
2063 /*
2064  * call-seq:
2065  * str << integer -> str
2066  * str.concat(integer) -> str
2067  * str << obj -> str
2068  * str.concat(obj) -> str
2069  *
2070  * Append---Concatenates the given object to <i>str</i>. If the object is a
2071  * <code>Integer</code>, it is considered as a codepoint, and is converted
2072  * to a character before concatenation.
2073  *
2074  * a = "hello "
2075  * a << "world" #=> "hello world"
2076  * a.concat(33) #=> "hello world!"
2077  */
2078 
2079 VALUE
2081 {
2082  unsigned int code;
2083  rb_encoding *enc = STR_ENC_GET(str1);
2084 
2085  if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
2086  if (rb_num_to_uint(str2, &code) == 0) {
2087  }
2088  else if (FIXNUM_P(str2)) {
2089  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2090  }
2091  else {
2092  rb_raise(rb_eRangeError, "bignum out of char range");
2093  }
2094  }
2095  else {
2096  return rb_str_append(str1, str2);
2097  }
2098 
2099  if (enc == rb_usascii_encoding()) {
2100  /* US-ASCII automatically extended to ASCII-8BIT */
2101  char buf[1];
2102  buf[0] = (char)code;
2103  if (code > 0xFF) {
2104  rb_raise(rb_eRangeError, "%u out of char range", code);
2105  }
2106  rb_str_cat(str1, buf, 1);
2107  if (code > 127) {
2110  }
2111  }
2112  else {
2113  long pos = RSTRING_LEN(str1);
2114  int cr = ENC_CODERANGE(str1);
2115  int len;
2116  char *buf;
2117 
2118  switch (len = rb_enc_codelen(code, enc)) {
2120  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2121  break;
2123  case 0:
2124  rb_raise(rb_eRangeError, "%u out of char range", code);
2125  break;
2126  }
2127  buf = ALLOCA_N(char, len + 1);
2128  rb_enc_mbcput(code, buf, enc);
2129  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2130  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2131  }
2132  rb_str_resize(str1, pos+len);
2133  strncpy(RSTRING_PTR(str1) + pos, buf, len);
2134  if (cr == ENC_CODERANGE_7BIT && code > 127)
2135  cr = ENC_CODERANGE_VALID;
2136  ENC_CODERANGE_SET(str1, cr);
2137  }
2138  return str1;
2139 }
2140 
2141 /*
2142  * call-seq:
2143  * str.prepend(other_str) -> str
2144  *
2145  * Prepend---Prepend the given string to <i>str</i>.
2146  *
2147  * a = "world"
2148  * a.prepend("hello ") #=> "hello world"
2149  * a #=> "hello world"
2150  */
2151 
2152 static VALUE
2154 {
2155  StringValue(str2);
2156  StringValue(str);
2157  rb_str_update(str, 0L, 0L, str2);
2158  return str;
2159 }
2160 
2161 st_index_t
2163 {
2164  int e = ENCODING_GET(str);
2165  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2166  e = 0;
2167  }
2168  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2169 }
2170 
2171 int
2173 {
2174  long len;
2175 
2176  if (!rb_str_comparable(str1, str2)) return 1;
2177  if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
2178  memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
2179  return 0;
2180  }
2181  return 1;
2182 }
2183 
2184 /*
2185  * call-seq:
2186  * str.hash -> fixnum
2187  *
2188  * Return a hash based on the string's length and content.
2189  */
2190 
2191 static VALUE
2193 {
2194  st_index_t hval = rb_str_hash(str);
2195  return INT2FIX(hval);
2196 }
2197 
2198 #define lesser(a,b) (((a)>(b))?(b):(a))
2199 
2200 int
2202 {
2203  int idx1, idx2;
2204  int rc1, rc2;
2205 
2206  if (RSTRING_LEN(str1) == 0) return TRUE;
2207  if (RSTRING_LEN(str2) == 0) return TRUE;
2208  idx1 = ENCODING_GET(str1);
2209  idx2 = ENCODING_GET(str2);
2210  if (idx1 == idx2) return TRUE;
2211  rc1 = rb_enc_str_coderange(str1);
2212  rc2 = rb_enc_str_coderange(str2);
2213  if (rc1 == ENC_CODERANGE_7BIT) {
2214  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
2216  return TRUE;
2217  }
2218  if (rc2 == ENC_CODERANGE_7BIT) {
2220  return TRUE;
2221  }
2222  return FALSE;
2223 }
2224 
2225 int
2227 {
2228  long len1, len2;
2229  const char *ptr1, *ptr2;
2230  int retval;
2231 
2232  if (str1 == str2) return 0;
2233  RSTRING_GETMEM(str1, ptr1, len1);
2234  RSTRING_GETMEM(str2, ptr2, len2);
2235  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
2236  if (len1 == len2) {
2237  if (!rb_str_comparable(str1, str2)) {
2238  if (ENCODING_GET(str1) > ENCODING_GET(str2))
2239  return 1;
2240  return -1;
2241  }
2242  return 0;
2243  }
2244  if (len1 > len2) return 1;
2245  return -1;
2246  }
2247  if (retval > 0) return 1;
2248  return -1;
2249 }
2250 
2251 /* expect tail call optimization */
2252 static VALUE
2253 str_eql(const VALUE str1, const VALUE str2)
2254 {
2255  const long len = RSTRING_LEN(str1);
2256  const char *ptr1, *ptr2;
2257 
2258  if (len != RSTRING_LEN(str2)) return Qfalse;
2259  if (!rb_str_comparable(str1, str2)) return Qfalse;
2260  if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
2261  return Qtrue;
2262  if (memcmp(ptr1, ptr2, len) == 0)
2263  return Qtrue;
2264  return Qfalse;
2265 }
2266 /*
2267  * call-seq:
2268  * str == obj -> true or false
2269  *
2270  * Equality---If <i>obj</i> is not a <code>String</code>, returns
2271  * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
2272  * <code><=></code> <i>obj</i> returns zero.
2273  */
2274 
2275 VALUE
2277 {
2278  if (str1 == str2) return Qtrue;
2279  if (TYPE(str2) != T_STRING) {
2280  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2281  return Qfalse;
2282  }
2283  return rb_equal(str2, str1);
2284  }
2285  return str_eql(str1, str2);
2286 }
2287 
2288 /*
2289  * call-seq:
2290  * str.eql?(other) -> true or false
2291  *
2292  * Two strings are equal if they have the same length and content.
2293  */
2294 
2295 static VALUE
2297 {
2298  if (str1 == str2) return Qtrue;
2299  if (TYPE(str2) != T_STRING) return Qfalse;
2300  return str_eql(str1, str2);
2301 }
2302 
2303 /*
2304  * call-seq:
2305  * str <=> other_str -> -1, 0, +1 or nil
2306  *
2307  * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
2308  * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
2309  * <i>str</i>. If the strings are of different lengths, and the strings are
2310  * equal when compared up to the shortest length, then the longer string is
2311  * considered greater than the shorter one. In older versions of Ruby, setting
2312  * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
2313  * in favor of using <code>String#casecmp</code>.
2314  *
2315  * <code><=></code> is the basis for the methods <code><</code>,
2316  * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
2317  * included from module <code>Comparable</code>. The method
2318  * <code>String#==</code> does not use <code>Comparable#==</code>.
2319  *
2320  * "abcdef" <=> "abcde" #=> 1
2321  * "abcdef" <=> "abcdef" #=> 0
2322  * "abcdef" <=> "abcdefg" #=> -1
2323  * "abcdef" <=> "ABCDEF" #=> 1
2324  */
2325 
2326 static VALUE
2328 {
2329  long result;
2330 
2331  if (TYPE(str2) != T_STRING) {
2332  if (!rb_respond_to(str2, rb_intern("to_str"))) {
2333  return Qnil;
2334  }
2335  else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2336  return Qnil;
2337  }
2338  else {
2339  VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2340 
2341  if (NIL_P(tmp)) return Qnil;
2342  if (!FIXNUM_P(tmp)) {
2343  return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2344  }
2345  result = -FIX2LONG(tmp);
2346  }
2347  }
2348  else {
2349  result = rb_str_cmp(str1, str2);
2350  }
2351  return LONG2NUM(result);
2352 }
2353 
2354 /*
2355  * call-seq:
2356  * str.casecmp(other_str) -> -1, 0, +1 or nil
2357  *
2358  * Case-insensitive version of <code>String#<=></code>.
2359  *
2360  * "abcdef".casecmp("abcde") #=> 1
2361  * "aBcDeF".casecmp("abcdef") #=> 0
2362  * "abcdef".casecmp("abcdefg") #=> -1
2363  * "abcdef".casecmp("ABCDEF") #=> 0
2364  */
2365 
2366 static VALUE
2368 {
2369  long len;
2370  rb_encoding *enc;
2371  char *p1, *p1end, *p2, *p2end;
2372 
2373  StringValue(str2);
2374  enc = rb_enc_compatible(str1, str2);
2375  if (!enc) {
2376  return Qnil;
2377  }
2378 
2379  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2380  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2381  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
2382  while (p1 < p1end && p2 < p2end) {
2383  if (*p1 != *p2) {
2384  unsigned int c1 = TOUPPER(*p1 & 0xff);
2385  unsigned int c2 = TOUPPER(*p2 & 0xff);
2386  if (c1 != c2)
2387  return INT2FIX(c1 < c2 ? -1 : 1);
2388  }
2389  p1++;
2390  p2++;
2391  }
2392  }
2393  else {
2394  while (p1 < p1end && p2 < p2end) {
2395  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
2396  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
2397 
2398  if (0 <= c1 && 0 <= c2) {
2399  c1 = TOUPPER(c1);
2400  c2 = TOUPPER(c2);
2401  if (c1 != c2)
2402  return INT2FIX(c1 < c2 ? -1 : 1);
2403  }
2404  else {
2405  int r;
2406  l1 = rb_enc_mbclen(p1, p1end, enc);
2407  l2 = rb_enc_mbclen(p2, p2end, enc);
2408  len = l1 < l2 ? l1 : l2;
2409  r = memcmp(p1, p2, len);
2410  if (r != 0)
2411  return INT2FIX(r < 0 ? -1 : 1);
2412  if (l1 != l2)
2413  return INT2FIX(l1 < l2 ? -1 : 1);
2414  }
2415  p1 += l1;
2416  p2 += l2;
2417  }
2418  }
2419  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2420  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2421  return INT2FIX(-1);
2422 }
2423 
2424 static long
2425 rb_str_index(VALUE str, VALUE sub, long offset)
2426 {
2427  long pos;
2428  char *s, *sptr, *e;
2429  long len, slen;
2430  rb_encoding *enc;
2431 
2432  enc = rb_enc_check(str, sub);
2433  if (is_broken_string(sub)) {
2434  return -1;
2435  }
2436  len = str_strlen(str, enc);
2437  slen = str_strlen(sub, enc);
2438  if (offset < 0) {
2439  offset += len;
2440  if (offset < 0) return -1;
2441  }
2442  if (len - offset < slen) return -1;
2443  s = RSTRING_PTR(str);
2444  e = s + RSTRING_LEN(str);
2445  if (offset) {
2446  offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2447  s += offset;
2448  }
2449  if (slen == 0) return offset;
2450  /* need proceed one character at a time */
2451  sptr = RSTRING_PTR(sub);
2452  slen = RSTRING_LEN(sub);
2453  len = RSTRING_LEN(str) - offset;
2454  for (;;) {
2455  char *t;
2456  pos = rb_memsearch(sptr, slen, s, len, enc);
2457  if (pos < 0) return pos;
2458  t = rb_enc_right_char_head(s, s+pos, e, enc);
2459  if (t == s + pos) break;
2460  if ((len -= t - s) <= 0) return -1;
2461  offset += t - s;
2462  s = t;
2463  }
2464  return pos + offset;
2465 }
2466 
2467 
2468 /*
2469  * call-seq:
2470  * str.index(substring [, offset]) -> fixnum or nil
2471  * str.index(regexp [, offset]) -> fixnum or nil
2472  *
2473  * Returns the index of the first occurrence of the given <i>substring</i> or
2474  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2475  * found. If the second parameter is present, it specifies the position in the
2476  * string to begin the search.
2477  *
2478  * "hello".index('e') #=> 1
2479  * "hello".index('lo') #=> 3
2480  * "hello".index('a') #=> nil
2481  * "hello".index(?e) #=> 1
2482  * "hello".index(/[aeiou]/, -3) #=> 4
2483  */
2484 
2485 static VALUE
2487 {
2488  VALUE sub;
2489  VALUE initpos;
2490  long pos;
2491 
2492  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2493  pos = NUM2LONG(initpos);
2494  }
2495  else {
2496  pos = 0;
2497  }
2498  if (pos < 0) {
2499  pos += str_strlen(str, STR_ENC_GET(str));
2500  if (pos < 0) {
2501  if (TYPE(sub) == T_REGEXP) {
2503  }
2504  return Qnil;
2505  }
2506  }
2507 
2508  switch (TYPE(sub)) {
2509  case T_REGEXP:
2510  if (pos > str_strlen(str, STR_ENC_GET(str)))
2511  return Qnil;
2512  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2513  rb_enc_check(str, sub), single_byte_optimizable(str));
2514 
2515  pos = rb_reg_search(sub, str, pos, 0);
2516  pos = rb_str_sublen(str, pos);
2517  break;
2518 
2519  default: {
2520  VALUE tmp;
2521 
2522  tmp = rb_check_string_type(sub);
2523  if (NIL_P(tmp)) {
2524  rb_raise(rb_eTypeError, "type mismatch: %s given",
2525  rb_obj_classname(sub));
2526  }
2527  sub = tmp;
2528  }
2529  /* fall through */
2530  case T_STRING:
2531  pos = rb_str_index(str, sub, pos);
2532  pos = rb_str_sublen(str, pos);
2533  break;
2534  }
2535 
2536  if (pos == -1) return Qnil;
2537  return LONG2NUM(pos);
2538 }
2539 
2540 static long
2541 rb_str_rindex(VALUE str, VALUE sub, long pos)
2542 {
2543  long len, slen;
2544  char *s, *sbeg, *e, *t;
2545  rb_encoding *enc;
2546  int singlebyte = single_byte_optimizable(str);
2547 
2548  enc = rb_enc_check(str, sub);
2549  if (is_broken_string(sub)) {
2550  return -1;
2551  }
2552  len = str_strlen(str, enc);
2553  slen = str_strlen(sub, enc);
2554  /* substring longer than string */
2555  if (len < slen) return -1;
2556  if (len - pos < slen) {
2557  pos = len - slen;
2558  }
2559  if (len == 0) {
2560  return pos;
2561  }
2562  sbeg = RSTRING_PTR(str);
2563  e = RSTRING_END(str);
2564  t = RSTRING_PTR(sub);
2565  slen = RSTRING_LEN(sub);
2566  s = str_nth(sbeg, e, pos, enc, singlebyte);
2567  while (s) {
2568  if (memcmp(s, t, slen) == 0) {
2569  return pos;
2570  }
2571  if (pos == 0) break;
2572  pos--;
2573  s = rb_enc_prev_char(sbeg, s, e, enc);
2574  }
2575  return -1;
2576 }
2577 
2578 
2579 /*
2580  * call-seq:
2581  * str.rindex(substring [, fixnum]) -> fixnum or nil
2582  * str.rindex(regexp [, fixnum]) -> fixnum or nil
2583  *
2584  * Returns the index of the last occurrence of the given <i>substring</i> or
2585  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2586  * found. If the second parameter is present, it specifies the position in the
2587  * string to end the search---characters beyond this point will not be
2588  * considered.
2589  *
2590  * "hello".rindex('e') #=> 1
2591  * "hello".rindex('l') #=> 3
2592  * "hello".rindex('a') #=> nil
2593  * "hello".rindex(?e) #=> 1
2594  * "hello".rindex(/[aeiou]/, -2) #=> 1
2595  */
2596 
2597 static VALUE
2599 {
2600  VALUE sub;
2601  VALUE vpos;
2602  rb_encoding *enc = STR_ENC_GET(str);
2603  long pos, len = str_strlen(str, enc);
2604 
2605  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2606  pos = NUM2LONG(vpos);
2607  if (pos < 0) {
2608  pos += len;
2609  if (pos < 0) {
2610  if (TYPE(sub) == T_REGEXP) {
2612  }
2613  return Qnil;
2614  }
2615  }
2616  if (pos > len) pos = len;
2617  }
2618  else {
2619  pos = len;
2620  }
2621 
2622  switch (TYPE(sub)) {
2623  case T_REGEXP:
2624  /* enc = rb_get_check(str, sub); */
2625  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2627 
2628  if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2629  pos = rb_reg_search(sub, str, pos, 1);
2630  pos = rb_str_sublen(str, pos);
2631  }
2632  if (pos >= 0) return LONG2NUM(pos);
2633  break;
2634 
2635  default: {
2636  VALUE tmp;
2637 
2638  tmp = rb_check_string_type(sub);
2639  if (NIL_P(tmp)) {
2640  rb_raise(rb_eTypeError, "type mismatch: %s given",
2641  rb_obj_classname(sub));
2642  }
2643  sub = tmp;
2644  }
2645  /* fall through */
2646  case T_STRING:
2647  pos = rb_str_rindex(str, sub, pos);
2648  if (pos >= 0) return LONG2NUM(pos);
2649  break;
2650  }
2651  return Qnil;
2652 }
2653 
2654 /*
2655  * call-seq:
2656  * str =~ obj -> fixnum or nil
2657  *
2658  * Match---If <i>obj</i> is a <code>Regexp</code>, uses it as a pattern to match
2659  * against <i>str</i>, and returns the position where the match starts, or
2660  * <code>nil</code> if there is no match. Otherwise, invokes
2661  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2662  * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
2663  *
2664  * "cat o' 9 tails" =~ /\d/ #=> 7
2665  * "cat o' 9 tails" =~ 9 #=> nil
2666  */
2667 
2668 static VALUE
2670 {
2671  switch (TYPE(y)) {
2672  case T_STRING:
2673  rb_raise(rb_eTypeError, "type mismatch: String given");
2674 
2675  case T_REGEXP:
2676  return rb_reg_match(y, x);
2677 
2678  default:
2679  return rb_funcall(y, rb_intern("=~"), 1, x);
2680  }
2681 }
2682 
2683 
2684 static VALUE get_pat(VALUE, int);
2685 
2686 
2687 /*
2688  * call-seq:
2689  * str.match(pattern) -> matchdata or nil
2690  * str.match(pattern, pos) -> matchdata or nil
2691  *
2692  * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2693  * then invokes its <code>match</code> method on <i>str</i>. If the second
2694  * parameter is present, it specifies the position in the string to begin the
2695  * search.
2696  *
2697  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
2698  * 'hello'.match('(.)\1')[0] #=> "ll"
2699  * 'hello'.match(/(.)\1/)[0] #=> "ll"
2700  * 'hello'.match('xx') #=> nil
2701  *
2702  * If a block is given, the block is invoked with the MatchData when the
2703  * match succeeds, so that you can write
2704  *
2705  * str.match(pat) {|m| ...}
2706  *
2707  * instead of
2708  *
2709  * if m = str.match(pat)
2710  * ...
2711  * end
2712  *
2713  * The return value is a value from block execution in this case.
2714  */
2715 
2716 static VALUE
2718 {
2719  VALUE re, result;
2720  if (argc < 1)
2721  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
2722  re = argv[0];
2723  argv[0] = str;
2724  result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2725  if (!NIL_P(result) && rb_block_given_p()) {
2726  return rb_yield(result);
2727  }
2728  return result;
2729 }
2730 
/*
 * Result of stepping a character to its neighbor.
 * NOTE(review): only the closing brace survived in this listing; the
 * enumerators are restored from their uses below -- confirm their order
 * against the upstream file.
 */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input is not a steppable character */
    NEIGHBOR_FOUND,     /* a neighboring character was written in place */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
2736 
2737 static enum neighbor_char
2738 enc_succ_char(char *p, long len, rb_encoding *enc)
2739 {
2740  long i;
2741  int l;
2742  while (1) {
2743  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2744  p[i] = '\0';
2745  if (i < 0)
2746  return NEIGHBOR_WRAPPED;
2747  ++((unsigned char*)p)[i];
2748  l = rb_enc_precise_mbclen(p, p+len, enc);
2749  if (MBCLEN_CHARFOUND_P(l)) {
2750  l = MBCLEN_CHARFOUND_LEN(l);
2751  if (l == len) {
2752  return NEIGHBOR_FOUND;
2753  }
2754  else {
2755  memset(p+l, 0xff, len-l);
2756  }
2757  }
2758  if (MBCLEN_INVALID_P(l) && i < len-1) {
2759  long len2;
2760  int l2;
2761  for (len2 = len-1; 0 < len2; len2--) {
2762  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2763  if (!MBCLEN_INVALID_P(l2))
2764  break;
2765  }
2766  memset(p+len2+1, 0xff, len-(len2+1));
2767  }
2768  }
2769 }
2770 
2771 static enum neighbor_char
2772 enc_pred_char(char *p, long len, rb_encoding *enc)
2773 {
2774  long i;
2775  int l;
2776  while (1) {
2777  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2778  p[i] = '\xff';
2779  if (i < 0)
2780  return NEIGHBOR_WRAPPED;
2781  --((unsigned char*)p)[i];
2782  l = rb_enc_precise_mbclen(p, p+len, enc);
2783  if (MBCLEN_CHARFOUND_P(l)) {
2784  l = MBCLEN_CHARFOUND_LEN(l);
2785  if (l == len) {
2786  return NEIGHBOR_FOUND;
2787  }
2788  else {
2789  memset(p+l, 0, len-l);
2790  }
2791  }
2792  if (MBCLEN_INVALID_P(l) && i < len-1) {
2793  long len2;
2794  int l2;
2795  for (len2 = len-1; 0 < len2; len2--) {
2796  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2797  if (!MBCLEN_INVALID_P(l2))
2798  break;
2799  }
2800  memset(p+len2+1, 0, len-(len2+1));
2801  }
2802  }
2803 }
2804 
2805 /*
2806  overwrite +p+ by succeeding letter in +enc+ and returns
2807  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2808  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2809  assuming each ranges are successive, and mbclen
2810  never change in each ranges.
2811  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2812  character.
2813  */
2814 static enum neighbor_char
2815 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
2816 {
2817  enum neighbor_char ret;
2818  unsigned int c;
2819  int ctype;
2820  int range;
2821  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2822 
2823  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2824  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2825  ctype = ONIGENC_CTYPE_DIGIT;
2826  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2827  ctype = ONIGENC_CTYPE_ALPHA;
2828  else
2829  return NEIGHBOR_NOT_CHAR;
2830 
2831  MEMCPY(save, p, char, len);
2832  ret = enc_succ_char(p, len, enc);
2833  if (ret == NEIGHBOR_FOUND) {
2834  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2835  if (rb_enc_isctype(c, ctype, enc))
2836  return NEIGHBOR_FOUND;
2837  }
2838  MEMCPY(p, save, char, len);
2839  range = 1;
2840  while (1) {
2841  MEMCPY(save, p, char, len);
2842  ret = enc_pred_char(p, len, enc);
2843  if (ret == NEIGHBOR_FOUND) {
2844  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2845  if (!rb_enc_isctype(c, ctype, enc)) {
2846  MEMCPY(p, save, char, len);
2847  break;
2848  }
2849  }
2850  else {
2851  MEMCPY(p, save, char, len);
2852  break;
2853  }
2854  range++;
2855  }
2856  if (range == 1) {
2857  return NEIGHBOR_NOT_CHAR;
2858  }
2859 
2860  if (ctype != ONIGENC_CTYPE_DIGIT) {
2861  MEMCPY(carry, p, char, len);
2862  return NEIGHBOR_WRAPPED;
2863  }
2864 
2865  MEMCPY(carry, p, char, len);
2866  enc_succ_char(carry, len, enc);
2867  return NEIGHBOR_WRAPPED;
2868 }
2869 
2870 
2871 /*
2872  * call-seq:
2873  * str.succ -> new_str
2874  * str.next -> new_str
2875  *
2876  * Returns the successor to <i>str</i>. The successor is calculated by
2877  * incrementing characters starting from the rightmost alphanumeric (or
2878  * the rightmost character if there are no alphanumerics) in the
2879  * string. Incrementing a digit always results in another digit, and
2880  * incrementing a letter results in another letter of the same case.
2881  * Incrementing nonalphanumerics uses the underlying character set's
2882  * collating sequence.
2883  *
2884  * If the increment generates a ``carry,'' the character to the left of
2885  * it is incremented. This process repeats until there is no carry,
2886  * adding an additional character if necessary.
2887  *
2888  * "abcd".succ #=> "abce"
2889  * "THX1138".succ #=> "THX1139"
2890  * "<<koala>>".succ #=> "<<koalb>>"
2891  * "1999zzz".succ #=> "2000aaa"
2892  * "ZZZ9999".succ #=> "AAAA0000"
2893  * "***".succ #=> "**+"
2894  */
2895 
2896 VALUE
2898 {
2899  rb_encoding *enc;
2900  VALUE str;
2901  char *sbeg, *s, *e, *last_alnum = 0;
2902  int c = -1;
2903  long l;
2904  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2905  long carry_pos = 0, carry_len = 1;
2906  enum neighbor_char neighbor = NEIGHBOR_FOUND;
2907 
2908  str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2909  rb_enc_cr_str_copy_for_substr(str, orig);
2910  OBJ_INFECT(str, orig);
2911  if (RSTRING_LEN(str) == 0) return str;
2912 
2913  enc = STR_ENC_GET(orig);
2914  sbeg = RSTRING_PTR(str);
2915  s = e = sbeg + RSTRING_LEN(str);
2916 
2917  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2918  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2919  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2920  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2921  s = last_alnum;
2922  break;
2923  }
2924  }
2925  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2926  neighbor = enc_succ_alnum_char(s, l, enc, carry);
2927  switch (neighbor) {
2928  case NEIGHBOR_NOT_CHAR:
2929  continue;
2930  case NEIGHBOR_FOUND:
2931  return str;
2932  case NEIGHBOR_WRAPPED:
2933  last_alnum = s;
2934  break;
2935  }
2936  c = 1;
2937  carry_pos = s - sbeg;
2938  carry_len = l;
2939  }
2940  if (c == -1) { /* str contains no alnum */
2941  s = e;
2942  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
2943  enum neighbor_char neighbor;
2944  if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2945  neighbor = enc_succ_char(s, l, enc);
2946  if (neighbor == NEIGHBOR_FOUND)
2947  return str;
2948  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2949  /* wrapped to \0...\0. search next valid char. */
2950  enc_succ_char(s, l, enc);
2951  }
2952  if (!rb_enc_asciicompat(enc)) {
2953  MEMCPY(carry, s, char, l);
2954  carry_len = l;
2955  }
2956  carry_pos = s - sbeg;
2957  }
2958  }
2959  RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2960  s = RSTRING_PTR(str) + carry_pos;
2961  memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2962  memmove(s, carry, carry_len);
2963  STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2964  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2965  rb_enc_str_coderange(str);
2966  return str;
2967 }
2968 
2969 
2970 /*
2971  * call-seq:
2972  * str.succ! -> str
2973  * str.next! -> str
2974  *
2975  * Equivalent to <code>String#succ</code>, but modifies the receiver in
2976  * place.
2977  */
2978 
2979 static VALUE
2981 {
2983 
2984  return str;
2985 }
2986 
2987 
2988 /*
2989  * call-seq:
2990  * str.upto(other_str, exclusive=false) {|s| block } -> str
2991  * str.upto(other_str, exclusive=false) -> an_enumerator
2992  *
2993  * Iterates through successive values, starting at <i>str</i> and
2994  * ending at <i>other_str</i> inclusive, passing each value in turn to
2995  * the block. The <code>String#succ</code> method is used to generate
2996  * each value. If optional second argument exclusive is omitted or is false,
2997  * the last value will be included; otherwise it will be excluded.
2998  *
2999  * If no block is given, an enumerator is returned instead.
3000  *
3001  * "a8".upto("b6") {|s| print s, ' ' }
3002  * for s in "a8".."b6"
3003  * print s, ' '
3004  * end
3005  *
3006  * <em>produces:</em>
3007  *
3008  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3009  * a8 a9 b0 b1 b2 b3 b4 b5 b6
3010  *
3011  * If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
3012  * both are recognized as decimal numbers. In addition, the width of
3013  * string (e.g. leading zeros) is handled appropriately.
3014  *
3015  * "9".upto("11").to_a #=> ["9", "10", "11"]
3016  * "25".upto("5").to_a #=> []
3017  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
3018  */
3019 
3020 static VALUE
3022 {
3023  VALUE end, exclusive;
3024  VALUE current, after_end;
3025  ID succ;
3026  int n, excl, ascii;
3027  rb_encoding *enc;
3028 
3029  rb_scan_args(argc, argv, "11", &end, &exclusive);
3030  RETURN_ENUMERATOR(beg, argc, argv);
3031  excl = RTEST(exclusive);
3032  CONST_ID(succ, "succ");
3033  StringValue(end);
3034  enc = rb_enc_check(beg, end);
3035  ascii = (is_ascii_string(beg) && is_ascii_string(end));
3036  /* single character */
3037  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
3038  char c = RSTRING_PTR(beg)[0];
3039  char e = RSTRING_PTR(end)[0];
3040 
3041  if (c > e || (excl && c == e)) return beg;
3042  for (;;) {
3043  rb_yield(rb_enc_str_new(&c, 1, enc));
3044  if (!excl && c == e) break;
3045  c++;
3046  if (excl && c == e) break;
3047  }
3048  return beg;
3049  }
3050  /* both edges are all digits */
3051  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
3052  char *s, *send;
3053  VALUE b, e;
3054  int width;
3055 
3056  s = RSTRING_PTR(beg); send = RSTRING_END(beg);
3057  width = rb_long2int(send - s);
3058  while (s < send) {
3059  if (!ISDIGIT(*s)) goto no_digits;
3060  s++;
3061  }
3062  s = RSTRING_PTR(end); send = RSTRING_END(end);
3063  while (s < send) {
3064  if (!ISDIGIT(*s)) goto no_digits;
3065  s++;
3066  }
3067  b = rb_str_to_inum(beg, 10, FALSE);
3068  e = rb_str_to_inum(end, 10, FALSE);
3069  if (FIXNUM_P(b) && FIXNUM_P(e)) {
3070  long bi = FIX2LONG(b);
3071  long ei = FIX2LONG(e);
3072  rb_encoding *usascii = rb_usascii_encoding();
3073 
3074  while (bi <= ei) {
3075  if (excl && bi == ei) break;
3076  rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
3077  bi++;
3078  }
3079  }
3080  else {
3081  ID op = excl ? '<' : rb_intern("<=");
3082  VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
3083 
3084  args[0] = INT2FIX(width);
3085  while (rb_funcall(b, op, 1, e)) {
3086  args[1] = b;
3087  rb_yield(rb_str_format(numberof(args), args, fmt));
3088  b = rb_funcall(b, succ, 0, 0);
3089  }
3090  }
3091  return beg;
3092  }
3093  /* normal case */
3094  no_digits:
3095  n = rb_str_cmp(beg, end);
3096  if (n > 0 || (excl && n == 0)) return beg;
3097 
3098  after_end = rb_funcall(end, succ, 0, 0);
3099  current = rb_str_dup(beg);
3100  while (!rb_str_equal(current, after_end)) {
3101  VALUE next = Qnil;
3102  if (excl || !rb_str_equal(current, end))
3103  next = rb_funcall(current, succ, 0, 0);
3104  rb_yield(current);
3105  if (NIL_P(next)) break;
3106  current = next;
3107  StringValue(current);
3108  if (excl && rb_str_equal(current, end)) break;
3109  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
3110  break;
3111  }
3112 
3113  return beg;
3114 }
3115 
3116 static VALUE
3117 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
3118 {
3119  if (rb_reg_search(re, str, 0, 0) >= 0) {
3121  int nth = rb_reg_backref_number(match, backref);
3122  return rb_reg_nth_match(nth, match);
3123  }
3124  return Qnil;
3125 }
3126 
3127 static VALUE
3129 {
3130  long idx;
3131 
3132  switch (TYPE(indx)) {
3133  case T_FIXNUM:
3134  idx = FIX2LONG(indx);
3135 
3136  num_index:
3137  str = rb_str_substr(str, idx, 1);
3138  if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
3139  return str;
3140 
3141  case T_REGEXP:
3142  return rb_str_subpat(str, indx, INT2FIX(0));
3143 
3144  case T_STRING:
3145  if (rb_str_index(str, indx, 0) != -1)
3146  return rb_str_dup(indx);
3147  return Qnil;
3148 
3149  default:
3150  /* check if indx is Range */
3151  {
3152  long beg, len;
3153  VALUE tmp;
3154 
3155  len = str_strlen(str, STR_ENC_GET(str));
3156  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
3157  case Qfalse:
3158  break;
3159  case Qnil:
3160  return Qnil;
3161  default:
3162  tmp = rb_str_substr(str, beg, len);
3163  return tmp;
3164  }
3165  }
3166  idx = NUM2LONG(indx);
3167  goto num_index;
3168  }
3169  return Qnil; /* not reached */
3170 }
3171 
3172 
3173 /*
3174  * call-seq:
3175  * str[fixnum] -> new_str or nil
3176  * str[fixnum, fixnum] -> new_str or nil
3177  * str[range] -> new_str or nil
3178  * str[regexp] -> new_str or nil
3179  * str[regexp, fixnum] -> new_str or nil
3180  * str[other_str] -> new_str or nil
3181  * str.slice(fixnum) -> new_str or nil
3182  * str.slice(fixnum, fixnum) -> new_str or nil
3183  * str.slice(range) -> new_str or nil
3184  * str.slice(regexp) -> new_str or nil
3185  * str.slice(regexp, fixnum) -> new_str or nil
3186  * str.slice(regexp, capname) -> new_str or nil
3187  * str.slice(other_str) -> new_str or nil
3188  *
3189  * Element Reference---If passed a single <code>Fixnum</code>, returns a
3190  * substring of one character at that position. If passed two <code>Fixnum</code>
3191  * objects, returns a substring starting at the offset given by the first, and
3192  * with a length given by the second. If passed a range, its beginning and end
3193  * are interpreted as offsets delimiting the substring to be returned. In all
3194  * three cases, if an offset is negative, it is counted from the end of <i>str</i>.
3195  * Returns <code>nil</code> if the initial offset falls outside the string or
3196  * the length is negative.
3197  *
3198  * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
3199  * returned. If a numeric or name parameter follows the regular expression, that
3200  * component of the <code>MatchData</code> is returned instead. If a
3201  * <code>String</code> is given, that string is returned if it occurs in
3202  * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
3203  * match.
3204  *
3205  * a = "hello there"
3206  * a[1] #=> "e"
3207  * a[2, 3] #=> "llo"
3208  * a[2..3] #=> "ll"
3209  * a[-3, 2] #=> "er"
3210  * a[7..-2] #=> "her"
3211  * a[-4..-2] #=> "her"
3212  * a[-2..-4] #=> ""
3213  * a[12..-1] #=> nil
3214  * a[/[aeiou](.)\1/] #=> "ell"
3215  * a[/[aeiou](.)\1/, 0] #=> "ell"
3216  * a[/[aeiou](.)\1/, 1] #=> "l"
3217  * a[/[aeiou](.)\1/, 2] #=> nil
3218  * a["lo"] #=> "lo"
3219  * a["bye"] #=> nil
3220  */
3221 
3222 static VALUE
3224 {
3225  if (argc == 2) {
3226  if (TYPE(argv[0]) == T_REGEXP) {
3227  return rb_str_subpat(str, argv[0], argv[1]);
3228  }
3229  return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
3230  }
3231  if (argc != 1) {
3232  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3233  }
3234  return rb_str_aref(str, argv[0]);
3235 }
3236 
3237 VALUE
3238 rb_str_drop_bytes(VALUE str, long len)
3239 {
3240  char *ptr = RSTRING_PTR(str);
3241  long olen = RSTRING_LEN(str), nlen;
3242 
3243  str_modifiable(str);
3244  if (len > olen) len = olen;
3245  nlen = olen - len;
3246  if (nlen <= RSTRING_EMBED_LEN_MAX) {
3247  char *oldptr = ptr;
3248  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
3249  STR_SET_EMBED(str);
3250  STR_SET_EMBED_LEN(str, nlen);
3251  ptr = RSTRING(str)->as.ary;
3252  memmove(ptr, oldptr + len, nlen);
3253  if (fl == STR_NOEMBED) xfree(oldptr);
3254  }
3255  else {
3256  if (!STR_SHARED_P(str)) rb_str_new4(str);
3257  ptr = RSTRING(str)->as.heap.ptr += len;
3258  RSTRING(str)->as.heap.len = nlen;
3259  }
3260  ptr[nlen] = 0;
3261  ENC_CODERANGE_CLEAR(str);
3262  return str;
3263 }
3264 
3265 static void
3266 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
3267 {
3268  if (beg == 0 && RSTRING_LEN(val) == 0) {
3269  rb_str_drop_bytes(str, len);
3270  OBJ_INFECT(str, val);
3271  return;
3272  }
3273 
3274  rb_str_modify(str);
3275  if (len < RSTRING_LEN(val)) {
3276  /* expand string */
3277  RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
3278  }
3279 
3280  if (RSTRING_LEN(val) != len) {
3281  memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
3282  RSTRING_PTR(str) + beg + len,
3283  RSTRING_LEN(str) - (beg + len));
3284  }
3285  if (RSTRING_LEN(val) < beg && len < 0) {
3286  MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
3287  }
3288  if (RSTRING_LEN(val) > 0) {
3289  memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
3290  }
3291  STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
3292  if (RSTRING_PTR(str)) {
3293  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3294  }
3295  OBJ_INFECT(str, val);
3296 }
3297 
3298 static void
3299 rb_str_splice(VALUE str, long beg, long len, VALUE val)
3300 {
3301  long slen;
3302  char *p, *e;
3303  rb_encoding *enc;
3304  int singlebyte = single_byte_optimizable(str);
3305  int cr;
3306 
3307  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
3308 
3309  StringValue(val);
3310  enc = rb_enc_check(str, val);
3311  slen = str_strlen(str, enc);
3312 
3313  if (slen < beg) {
3314  out_of_range:
3315  rb_raise(rb_eIndexError, "index %ld out of string", beg);
3316  }
3317  if (beg < 0) {
3318  if (-beg > slen) {
3319  goto out_of_range;
3320  }
3321  beg += slen;
3322  }
3323  if (slen < len || slen < beg + len) {
3324  len = slen - beg;
3325  }
3326  str_modify_keep_cr(str);
3327  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
3328  if (!p) p = RSTRING_END(str);
3329  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
3330  if (!e) e = RSTRING_END(str);
3331  /* error check */
3332  beg = p - RSTRING_PTR(str); /* physical position */
3333  len = e - p; /* physical length */
3334  rb_str_splice_0(str, beg, len, val);
3335  rb_enc_associate(str, enc);
3337  if (cr != ENC_CODERANGE_BROKEN)
3338  ENC_CODERANGE_SET(str, cr);
3339 }
3340 
3341 void
3342 rb_str_update(VALUE str, long beg, long len, VALUE val)
3343 {
3344  rb_str_splice(str, beg, len, val);
3345 }
3346 
3347 static void
3348 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
3349 {
3350  int nth;
3351  VALUE match;
3352  long start, end, len;
3353  rb_encoding *enc;
3354  struct re_registers *regs;
3355 
3356  if (rb_reg_search(re, str, 0, 0) < 0) {
3357  rb_raise(rb_eIndexError, "regexp not matched");
3358  }
3359  match = rb_backref_get();
3360  nth = rb_reg_backref_number(match, backref);
3361  regs = RMATCH_REGS(match);
3362  if (nth >= regs->num_regs) {
3363  out_of_range:
3364  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
3365  }
3366  if (nth < 0) {
3367  if (-nth >= regs->num_regs) {
3368  goto out_of_range;
3369  }
3370  nth += regs->num_regs;
3371  }
3372 
3373  start = BEG(nth);
3374  if (start == -1) {
3375  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
3376  }
3377  end = END(nth);
3378  len = end - start;
3379  StringValue(val);
3380  enc = rb_enc_check(str, val);
3381  rb_str_splice_0(str, start, len, val);
3382  rb_enc_associate(str, enc);
3383 }
3384 
3385 static VALUE
3386 rb_str_aset(VALUE str, VALUE indx, VALUE val)
3387 {
3388  long idx, beg;
3389 
3390  switch (TYPE(indx)) {
3391  case T_FIXNUM:
3392  idx = FIX2LONG(indx);
3393  num_index:
3394  rb_str_splice(str, idx, 1, val);
3395  return val;
3396 
3397  case T_REGEXP:
3398  rb_str_subpat_set(str, indx, INT2FIX(0), val);
3399  return val;
3400 
3401  case T_STRING:
3402  beg = rb_str_index(str, indx, 0);
3403  if (beg < 0) {
3404  rb_raise(rb_eIndexError, "string not matched");
3405  }
3406  beg = rb_str_sublen(str, beg);
3407  rb_str_splice(str, beg, str_strlen(indx, 0), val);
3408  return val;
3409 
3410  default:
3411  /* check if indx is Range */
3412  {
3413  long beg, len;
3414  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3415  rb_str_splice(str, beg, len, val);
3416  return val;
3417  }
3418  }
3419  idx = NUM2LONG(indx);
3420  goto num_index;
3421  }
3422 }
3423 
3424 /*
3425  * call-seq:
3426  * str[fixnum] = new_str
3427  * str[fixnum, fixnum] = new_str
3428  * str[range] = aString
3429  * str[regexp] = new_str
3430  * str[regexp, fixnum] = new_str
3431  * str[regexp, name] = new_str
3432  * str[other_str] = new_str
3433  *
3434  * Element Assignment---Replaces some or all of the content of <i>str</i>. The
3435  * portion of the string affected is determined using the same criteria as
3436  * <code>String#[]</code>. If the replacement string is not the same length as
3437  * the text it is replacing, the string will be adjusted accordingly. If the
3438  * regular expression or string used as the index doesn't match a position
3439  * in the string, <code>IndexError</code> is raised. If the regular expression
3440  * form is used, the optional second <code>Fixnum</code> allows you to specify
3441  * which portion of the match to replace (effectively using the
3442  * <code>MatchData</code> indexing rules). The forms that take a
3443  * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3444  * out of range; the <code>Range</code> form will raise a
3445  * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3446  * forms will raise an <code>IndexError</code> when nothing matches.
3447  */
3448 
3449 static VALUE
3451 {
3452  if (argc == 3) {
3453  if (TYPE(argv[0]) == T_REGEXP) {
3454  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
3455  }
3456  else {
3457  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3458  }
3459  return argv[2];
3460  }
3461  if (argc != 2) {
3462  rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
3463  }
3464  return rb_str_aset(str, argv[0], argv[1]);
3465 }
3466 
3467 /*
3468  * call-seq:
3469  * str.insert(index, other_str) -> str
3470  *
3471  * Inserts <i>other_str</i> before the character at the given
3472  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
3473  * end of the string, and insert <em>after</em> the given character.
3474  * The intent is to insert <i>other_str</i> so that it starts at the given
3475  * <i>index</i>.
3476  *
3477  * "abcd".insert(0, 'X') #=> "Xabcd"
3478  * "abcd".insert(3, 'X') #=> "abcXd"
3479  * "abcd".insert(4, 'X') #=> "abcdX"
3480  * "abcd".insert(-3, 'X') #=> "abXcd"
3481  * "abcd".insert(-1, 'X') #=> "abcdX"
3482  */
3483 
3484 static VALUE
3486 {
3487  long pos = NUM2LONG(idx);
3488 
3489  if (pos == -1) {
3490  return rb_str_append(str, str2);
3491  }
3492  else if (pos < 0) {
3493  pos++;
3494  }
3495  rb_str_splice(str, pos, 0, str2);
3496  return str;
3497 }
3498 
3499 
3500 /*
3501  * call-seq:
3502  * str.slice!(fixnum) -> new_str or nil
3503  * str.slice!(fixnum, fixnum) -> new_str or nil
3504  * str.slice!(range) -> new_str or nil
3505  * str.slice!(regexp) -> new_str or nil
3506  * str.slice!(other_str) -> new_str or nil
3507  *
3508  * Deletes the specified portion from <i>str</i>, and returns the portion
3509  * deleted.
3510  *
3511  * string = "this is a string"
3512  * string.slice!(2) #=> "i"
3513  * string.slice!(3..6) #=> " is "
3514  * string.slice!(/s.*t/) #=> "sa st"
3515  * string.slice!("r") #=> "r"
3516  * string #=> "thing"
3517  */
3518 
3519 static VALUE
3521 {
3522  VALUE result;
3523  VALUE buf[3];
3524  int i;
3525 
3526  if (argc < 1 || 2 < argc) {
3527  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3528  }
3529  for (i=0; i<argc; i++) {
3530  buf[i] = argv[i];
3531  }
3532  str_modify_keep_cr(str);
3533  result = rb_str_aref_m(argc, buf, str);
3534  if (!NIL_P(result)) {
3535  buf[i] = rb_str_new(0,0);
3536  rb_str_aset_m(argc+1, buf, str);
3537  }
3538  return result;
3539 }
3540 
3541 static VALUE
3542 get_pat(VALUE pat, int quote)
3543 {
3544  VALUE val;
3545 
3546  switch (TYPE(pat)) {
3547  case T_REGEXP:
3548  return pat;
3549 
3550  case T_STRING:
3551  break;
3552 
3553  default:
3554  val = rb_check_string_type(pat);
3555  if (NIL_P(val)) {
3556  Check_Type(pat, T_REGEXP);
3557  }
3558  pat = val;
3559  }
3560 
3561  if (quote) {
3562  pat = rb_reg_quote(pat);
3563  }
3564 
3565  return rb_reg_regcomp(pat);
3566 }
3567 
3568 
3569 /*
3570  * call-seq:
3571  * str.sub!(pattern, replacement) -> str or nil
3572  * str.sub!(pattern) {|match| block } -> str or nil
3573  *
3574  * Performs the substitutions of <code>String#sub</code> in place,
3575  * returning <i>str</i>, or <code>nil</code> if no substitutions were
3576  * performed.
3577  */
3578 
3579 static VALUE
3581 {
3582  VALUE pat, repl, hash = Qnil;
3583  int iter = 0;
3584  int tainted = 0;
3585  int untrusted = 0;
3586  long plen;
3587 
3588  if (argc == 1 && rb_block_given_p()) {
3589  iter = 1;
3590  }
3591  else if (argc == 2) {
3592  repl = argv[1];
3593  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3594  if (NIL_P(hash)) {
3595  StringValue(repl);
3596  }
3597  if (OBJ_TAINTED(repl)) tainted = 1;
3598  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3599  }
3600  else {
3601  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3602  }
3603 
3604  pat = get_pat(argv[0], 1);
3605  str_modifiable(str);
3606  if (rb_reg_search(pat, str, 0, 0) >= 0) {
3607  rb_encoding *enc;
3608  int cr = ENC_CODERANGE(str);
3610  struct re_registers *regs = RMATCH_REGS(match);
3611  long beg0 = BEG(0);
3612  long end0 = END(0);
3613  char *p, *rp;
3614  long len, rlen;
3615 
3616  if (iter || !NIL_P(hash)) {
3617  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3618 
3619  if (iter) {
3620  repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3621  }
3622  else {
3623  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3624  repl = rb_obj_as_string(repl);
3625  }
3626  str_mod_check(str, p, len);
3627  rb_check_frozen(str);
3628  }
3629  else {
3630  repl = rb_reg_regsub(repl, str, regs, pat);
3631  }
3632  enc = rb_enc_compatible(str, repl);
3633  if (!enc) {
3634  rb_encoding *str_enc = STR_ENC_GET(str);
3635  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
3636  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
3637  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
3638  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3639  rb_enc_name(str_enc),
3640  rb_enc_name(STR_ENC_GET(repl)));
3641  }
3642  enc = STR_ENC_GET(repl);
3643  }
3644  rb_str_modify(str);
3645  rb_enc_associate(str, enc);
3646  if (OBJ_TAINTED(repl)) tainted = 1;
3647  if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3648  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3649  int cr2 = ENC_CODERANGE(repl);
3650  if (cr2 == ENC_CODERANGE_BROKEN ||
3651  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
3652  cr = ENC_CODERANGE_UNKNOWN;
3653  else
3654  cr = cr2;
3655  }
3656  plen = end0 - beg0;
3657  rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
3658  len = RSTRING_LEN(str);
3659  if (rlen > plen) {
3660  RESIZE_CAPA(str, len + rlen - plen);
3661  }
3662  p = RSTRING_PTR(str);
3663  if (rlen != plen) {
3664  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
3665  }
3666  memcpy(p + beg0, rp, rlen);
3667  len += rlen - plen;
3668  STR_SET_LEN(str, len);
3669  RSTRING_PTR(str)[len] = '\0';
3670  ENC_CODERANGE_SET(str, cr);
3671  if (tainted) OBJ_TAINT(str);
3672  if (untrusted) OBJ_UNTRUST(str);
3673 
3674  return str;
3675  }
3676  return Qnil;
3677 }
3678 
3679 
3680 /*
3681  * call-seq:
3682  * str.sub(pattern, replacement) -> new_str
3683  * str.sub(pattern, hash) -> new_str
3684  * str.sub(pattern) {|match| block } -> new_str
3685  *
3686  * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3687  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3688  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3689  * regular expression metacharacters it contains will be interpreted
3690  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3691  * instead of a digit.
3692  *
3693  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3694  * the matched text. It may contain back-references to the pattern's capture
3695  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3696  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3697  * double-quoted string, both back-references must be preceded by an
3698  * additional backslash. However, within <i>replacement</i> the special match
3699  * variables, such as <code>$&</code>, will not refer to the current match.
3700  *
3701  * If the second argument is a <code>Hash</code>, and the matched text is one
3702  * of its keys, the corresponding value is the replacement string.
3703  *
3704  * In the block form, the current match string is passed in as a parameter,
3705  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3706  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3707  * returned by the block will be substituted for the match on each call.
3708  *
3709  * The result inherits any tainting in the original string or any supplied
3710  * replacement string.
3711  *
3712  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
3713  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
3714  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
3715  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
3716  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
3717  * #=> "Is /bin/bash your preferred shell?"
3718  */
3719 
3720 static VALUE
3722 {
3723  str = rb_str_dup(str);
3724  rb_str_sub_bang(argc, argv, str);
3725  return str;
3726 }
3727 
/*
 * Shared implementation of global substitution; the two wrappers further
 * down call it with bang=1 and bang=0.
 *
 * argc/argv: one argument (the pattern; replacements come from the block)
 *            or two (pattern plus a replacement that is either convertible
 *            to a Hash through to_hash or coerced to a String).
 * str:       the subject string.
 * bang:      non-zero -> the result replaces the contents of str and str is
 *            returned, or nil is returned when the pattern never matches;
 *            zero -> a new string is returned (a plain duplicate of str when
 *            the pattern never matches).
 *
 * Raises ArgumentError for any other argument count and RuntimeError when
 * the block / hash lookup returns the internal destination buffer itself.
 * The result is tainted when the replacement or any substituted value is
 * tainted.
 *
 * NOTE(review): listing line 3774 is absent from this extract (the embedded
 * numbers jump from 3773 to 3775); whatever statement stood there is not
 * shown here.
 */
3728 static VALUE
3729 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3730 {
3731  VALUE pat, val, repl, match, dest, hash = Qnil;
3732  struct re_registers *regs;
3733  long beg, n;
3734  long beg0, end0;
3735  long offset, blen, slen, len, last;
3736  int iter = 0;
3737  char *sp, *cp;
3738  int tainted = 0;
3739  rb_encoding *str_enc;
3740 
3741  switch (argc) {
3742  case 1:
3743  RETURN_ENUMERATOR(str, argc, argv);
3744  iter = 1; /* replacement text will come from yielding to the block */
3745  break;
3746  case 2:
3747  repl = argv[1];
3748  hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3749  if (NIL_P(hash)) {
3750  StringValue(repl); /* not hash-like: must be usable as a String */
3751  }
3752  if (OBJ_TAINTED(repl)) tainted = 1;
3753  break;
3754  default:
3755  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
3756  }
3757 
3758  pat = get_pat(argv[0], 1);
3759  beg = rb_reg_search(pat, str, 0, 0);
3760  if (beg < 0) {
3761  if (bang) return Qnil; /* no match, no substitution */
3762  return rb_str_dup(str);
3763  }
3764 
3765  offset = 0;
3766  n = 0; /* match counter; incremented below but not read in this function */
3767  blen = RSTRING_LEN(str) + 30; /* len + margin */
3768  dest = rb_str_buf_new(blen);
3769  sp = RSTRING_PTR(str); /* sp/slen snapshot is handed to str_mod_check() below */
3770  slen = RSTRING_LEN(str);
3771  cp = sp; /* cp: start of the not-yet-copied part of str */
3772  str_enc = STR_ENC_GET(str);
3773  rb_enc_associate(dest, str_enc);
3775 
3776  do {
3777  n++;
3778  match = rb_backref_get();
3779  regs = RMATCH_REGS(match);
3780  beg0 = BEG(0); /* bounds of the whole match */
3781  end0 = END(0);
3782  if (iter || !NIL_P(hash)) {
3783  if (iter) {
3784  val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3785  }
3786  else {
3787  val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3788  val = rb_obj_as_string(val);
3789  }
3790  str_mod_check(str, sp, slen); /* presumably detects str being modified by the callback -- confirm */
3791  if (val == dest) { /* paranoid check [ruby-dev:24827] */
3792  rb_raise(rb_eRuntimeError, "block should not cheat");
3793  }
3794  }
3795  else {
3796  val = rb_reg_regsub(repl, str, regs, pat);
3797  }
3798 
3799  if (OBJ_TAINTED(val)) tainted = 1;
3800 
3801  len = beg - offset; /* copy pre-match substr */
3802  if (len) {
3803  rb_enc_str_buf_cat(dest, cp, len, str_enc);
3804  }
3805 
3806  rb_str_buf_append(dest, val);
3807 
3808  last = offset; /* remember where this iteration's search started */
3809  offset = end0;
3810  if (beg0 == end0) {
3811  /*
3812  * Always consume at least one character of the input string
3813  * in order to prevent infinite loops.
3814  */
3815  if (RSTRING_LEN(str) <= end0) break;
3816  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3817  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3818  offset = end0 + len;
3819  }
3820  cp = RSTRING_PTR(str) + offset;
3821  if (offset > RSTRING_LEN(str)) break;
3822  beg = rb_reg_search(pat, str, offset, 0);
3823  } while (beg >= 0);
3824  if (RSTRING_LEN(str) > offset) { /* copy the tail after the final match */
3825  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3826  }
3827  rb_reg_search(pat, str, last, 0); /* result unused; presumably re-run to restore the last-match data -- confirm */
3828  if (bang) {
3829  rb_str_shared_replace(str, dest);
3830  }
3831  else {
3832  RBASIC(dest)->klass = rb_obj_class(str); /* result takes the receiver's class */
3833  OBJ_INFECT(dest, str);
3834  str = dest;
3835  }
3836 
3837  if (tainted) OBJ_TAINT(str);
3838  return str;
3839 }
3840 
3841 
3842 /*
3843  * call-seq:
3844  * str.gsub!(pattern, replacement) -> str or nil
3845  * str.gsub!(pattern) {|match| block } -> str or nil
3846  * str.gsub!(pattern) -> an_enumerator
3847  *
3848  * Performs the substitutions of <code>String#gsub</code> in place, returning
3849  * <i>str</i>, or <code>nil</code> if no substitutions were performed.
3850  * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
3851  */
3852 
/*
 * In-place entry point: calls str_modify_keep_cr() on the receiver, then
 * delegates to str_gsub() with bang=1 and returns its result (nil when the
 * pattern never matched, per str_gsub()).
 * NOTE(review): the declarator line (listing line 3854) is absent from this
 * extract.
 */
3853 static VALUE
3855 {
3856  str_modify_keep_cr(str);
3857  return str_gsub(argc, argv, str, 1);
3858 }
3859 
3860 
3861 /*
3862  * call-seq:
3863  * str.gsub(pattern, replacement) -> new_str
3864  * str.gsub(pattern, hash) -> new_str
3865  * str.gsub(pattern) {|match| block } -> new_str
3866  * str.gsub(pattern) -> enumerator
3867  *
3868  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
3869  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
3870  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
3871  * regular expression metacharacters it contains will be interpreted
3872  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
3873  * instead of a digit.
3874  *
3875  * If <i>replacement</i> is a <code>String</code> it will be substituted for
3876  * the matched text. It may contain back-references to the pattern's capture
3877  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
3878  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
3879  * double-quoted string, both back-references must be preceded by an
3880  * additional backslash. However, within <i>replacement</i> the special match
3881  * variables, such as <code>$&</code>, will not refer to the current match.
3882  *
3883  * If the second argument is a <code>Hash</code>, and the matched text is one
3884  * of its keys, the corresponding value is the replacement string.
3885  *
3886  * In the block form, the current match string is passed in as a parameter,
3887  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3888  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3889  * returned by the block will be substituted for the match on each call.
3890  *
3891  * The result inherits any tainting in the original string or any supplied
3892  * replacement string.
3893  *
3894  * When neither a block nor a second argument is supplied, an
3895  * <code>Enumerator</code> is returned.
3896  *
3897  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
3898  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
3899  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
3900  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
3901  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
3902  */
3903 
/*
 * Non-destructive entry point: delegates straight to str_gsub() with bang=0,
 * which builds and returns a new string.
 * NOTE(review): the declarator line (listing line 3905) is absent from this
 * extract.
 */
3904 static VALUE
3906 {
3907  return str_gsub(argc, argv, str, 0);
3908 }
3909 
3910 
3911 /*
3912  * call-seq:
3913  * str.replace(other_str) -> str
3914  *
3915  * Replaces the contents and taintedness of <i>str</i> with the corresponding
3916  * values in <i>other_str</i>.
3917  *
3918  * s = "hello" #=> "hello"
3919  * s.replace "world" #=> "world"
3920  */
3921 
/*
 * Replaces the contents of str with those of str2 and returns the result of
 * str_replace().  Replacing a string with itself is a no-op that returns str.
 * str_modifiable() runs before anything else, including the self-replace
 * shortcut; str2 is coerced with StringValue() before str is discarded.
 * NOTE(review): the declarator line (listing line 3923) is absent from this
 * extract.
 */
3922 VALUE
3924 {
3925  str_modifiable(str);
3926  if (str == str2) return str;
3927 
3928  StringValue(str2);
3929  str_discard(str);
3930  return str_replace(str, str2);
3931 }
3932 
3933 /*
3934  * call-seq:
3935  * string.clear -> string
3936  *
3937  * Makes string empty.
3938  *
3939  * a = "abcde"
3940  * a.clear #=> ""
3941  */
3942 
/*
 * Empties str in place and returns it: discards the current contents,
 * switches the string to its embedded representation with length 0 and
 * writes a terminating 0 byte.
 * NOTE(review): the declarator line (listing line 3944) and the statements
 * belonging to both arms of the if/else (listing lines 3951 and 3953) are
 * absent from this extract, so what each arm does is not visible here.
 */
3943 static VALUE
3945 {
3946  str_discard(str);
3947  STR_SET_EMBED(str);
3948  STR_SET_EMBED_LEN(str, 0);
3949  RSTRING_PTR(str)[0] = 0;
3950  if (rb_enc_asciicompat(STR_ENC_GET(str)))
3952  else
3954  return str;
3955 }
3956 
3957 /*
3958  * call-seq:
3959  * string.chr -> string
3960  *
3961  * Returns a one-character string at the beginning of the string.
3962  *
3963  * a = "abcde"
3964  * a.chr #=> "a"
3965  */
3966 
/*
 * Returns rb_str_substr(str, 0, 1), i.e. the substring of length 1 starting
 * at offset 0.
 * NOTE(review): the declarator line (listing line 3968) is absent from this
 * extract.
 */
3967 static VALUE
3969 {
3970  return rb_str_substr(str, 0, 1);
3971 }
3972 
3973 /*
3974  * call-seq:
3975  * str.getbyte(index) -> 0 .. 255
3976  *
3977  * Returns the <i>index</i>th byte as an integer, or <code>nil</code> if <i>index</i> is out of range.
3978  */
/*
 * Returns the byte at the given index as a Fixnum in 0..255, or nil when the
 * index lies outside the string.  A negative index counts back from the end.
 * NOTE(review): the declarator line (listing line 3980) is absent from this
 * extract; the body reads str and index.
 */
3979 static VALUE
3981 {
3982  long pos = NUM2LONG(index);
3983 
3984  if (pos < 0)
3985  pos += RSTRING_LEN(str);
3986  if (pos < 0 || RSTRING_LEN(str) <= pos)
3987  return Qnil;
3988 
3989  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); /* cast keeps the result non-negative */
3990 }
3991 
3992 /*
3993  * call-seq:
3994  * str.setbyte(index, int) -> int
3995  *
3996  * Sets the <i>index</i>th byte of <i>str</i> to <i>int</i> and returns <i>int</i>.
3997  */
3998 static VALUE
3999 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
4000 {
4001  long pos = NUM2LONG(index);
4002  int byte = NUM2INT(value);
4003 
4004  rb_str_modify(str);
4005 
4006  if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
4007  rb_raise(rb_eIndexError, "index %ld out of string", pos);
4008  if (pos < 0)
4009  pos += RSTRING_LEN(str);
4010 
4011  RSTRING_PTR(str)[pos] = byte;
4012 
4013  return value;
4014 }
4015 
/*
 * Byte-oriented substring: returns a new string holding up to len bytes of
 * str starting at byte offset beg, or nil when beg is past the end, len is
 * negative, or a negative beg reaches before the start.  A negative beg
 * counts back from the end; len is clamped to the bytes available.
 * The result gets str's encoding (str_enc_copy) and is infected by str.
 *
 * NOTE(review): listing lines 4051, 4053, 4058 and 4061 are absent from this
 * extract -- they are the statements inside the empty/non-empty branches and
 * the switch arms near the end, so what those arms do is not visible here.
 */
4016 static VALUE
4017 str_byte_substr(VALUE str, long beg, long len)
4018 {
4019  char *p, *s = RSTRING_PTR(str);
4020  long n = RSTRING_LEN(str);
4021  VALUE str2;
4022 
4023  if (beg > n || len < 0) return Qnil;
4024  if (beg < 0) {
4025  beg += n;
4026  if (beg < 0) return Qnil;
4027  }
4028  if (beg + len > n)
4029  len = n - beg;
4030  if (len <= 0) {
4031  len = 0;
4032  p = 0; /* empty result: no source pointer */
4033  }
4034  else
4035  p = s + beg;
4036 
4037  if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
 /* Slice is a long tail of str: derive str2 from str, then move its heap
  * pointer/length so it covers only the last len bytes.  Presumably this
  * shares the buffer instead of copying -- confirm against
  * rb_str_new4()/str_new3(). */
4038  str2 = rb_str_new4(str);
4039  str2 = str_new3(rb_obj_class(str2), str2);
4040  RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
4041  RSTRING(str2)->as.heap.len = len;
4042  }
4043  else {
4044  str2 = rb_str_new5(str, p, len);
4045  }
4046 
4047  str_enc_copy(str2, str);
4048 
4049  if (RSTRING_LEN(str2) == 0) {
4050  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
4052  else
4054  }
4055  else {
4056  switch (ENC_CODERANGE(str)) {
4057  case ENC_CODERANGE_7BIT:
4059  break;
4060  default:
4062  break;
4063  }
4064  }
4065 
4066  OBJ_INFECT(str2, str);
4067 
4068  return str2;
4069 }
4070 
/*
 * Single-argument byte indexing.
 *  - Fixnum index: returns the one-byte substring at that offset, or nil
 *    when str_byte_substr() returns nil or an empty string.
 *  - Otherwise the argument is first tried as a Range via
 *    rb_range_beg_len(): Qnil yields nil, Qfalse means "not a range", any
 *    other result yields the byte substring for the computed beg/len.
 *  - A non-range argument is converted with NUM2LONG() and handled like a
 *    Fixnum index.
 * NOTE(review): the declarator line (listing line 4072) is absent from this
 * extract; the body reads str and indx.
 */
4071 static VALUE
4073 {
4074  long idx;
4075  switch (TYPE(indx)) {
4076  case T_FIXNUM:
4077  idx = FIX2LONG(indx);
4078 
4079  num_index:
4080  str = str_byte_substr(str, idx, 1);
4081  if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
4082  return str;
4083 
4084  default:
4085  /* check if indx is Range */
4086  {
4087  long beg, len = RSTRING_LEN(str);
4088 
4089  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4090  case Qfalse:
4091  break;
4092  case Qnil:
4093  return Qnil;
4094  default:
4095  return str_byte_substr(str, beg, len);
4096  }
4097  }
4098  idx = NUM2LONG(indx); /* not a range: coerce to an integer index */
4099  goto num_index;
4100  }
4101  return Qnil; /* not reached */
4102 }
4103 
4104 /*
4105  * call-seq:
4106  * str.byteslice(fixnum) -> new_str or nil
4107  * str.byteslice(fixnum, fixnum) -> new_str or nil
4108  * str.byteslice(range) -> new_str or nil
4109  *
4110  * Byte Reference---If passed a single <code>Fixnum</code>, returns a
4111  * substring of one byte at that position. If passed two <code>Fixnum</code>
4112  * objects, returns a substring starting at the offset given by the first, and
4113  * a length given by the second. If given a <code>Range</code>, a substring containing
4114  * bytes at offsets given by the range is returned. In all three cases, if
4115  * an offset is negative, it is counted from the end of <i>str</i>. Returns
4116  * <code>nil</code> if the initial offset falls outside the string, the length
4117  * is negative, or the beginning of the range is greater than the end.
4118  * The resulting string keeps the encoding of the original string.
4119  *
4120  * "hello".byteslice(1) #=> "e"
4121  * "hello".byteslice(-1) #=> "o"
4122  * "hello".byteslice(1, 2) #=> "el"
4123  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
4124  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
4125  */
4126 
/*
 * Dispatches on the argument count: two arguments are converted with
 * NUM2LONG() and passed to str_byte_substr() as offset and length; one
 * argument goes to str_byte_aref().  Any other count raises ArgumentError.
 * NOTE(review): the declarator line (listing line 4128) is absent from this
 * extract.
 */
4127 static VALUE
4129 {
4130  if (argc == 2) {
4131  return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
4132  }
4133  if (argc != 1) {
4134  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
4135  }
4136  return str_byte_aref(str, argv[0]);
4137 }
4138 
4139 /*
4140  * call-seq:
4141  * str.reverse -> new_str
4142  *
4143  * Returns a new string with the characters from <i>str</i> in reverse order.
4144  *
4145  * "stressed".reverse #=> "desserts"
4146  */
4147 
/*
 * Returns a new string with the characters of str in reverse order; strings
 * of length 0 or 1 are simply duplicated.  The result is filled from its end
 * towards its start while str is read front to back, one character (one or
 * more bytes) at a time, so multibyte characters keep their byte order.
 *
 * `single` stays 1 unless a character longer than one byte, or a byte with
 * the high bit set, is seen.
 *
 * NOTE(review): the declarator line (listing line 4149) and listing lines
 * 4193, 4196 and 4199 are absent from this extract; the latter are the
 * statements inside the `single` if/else near the end and one statement
 * just before the return, so their effect is not visible here.
 */
4148 static VALUE
4150 {
4151  rb_encoding *enc;
4152  VALUE rev;
4153  char *s, *e, *p;
4154  int single = 1;
4155 
4156  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
4157  enc = STR_ENC_GET(str);
4158  rev = rb_str_new5(str, 0, RSTRING_LEN(str));
4159  s = RSTRING_PTR(str); e = RSTRING_END(str);
4160  p = RSTRING_END(rev); /* write cursor starts at the end of the result */
4161 
4162  if (RSTRING_LEN(str) > 1) { /* always true here: shorter strings returned above */
4163  if (single_byte_optimizable(str)) {
4164  while (s < e) {
4165  *--p = *s++;
4166  }
4167  }
4168  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
4169  while (s < e) {
4170  int clen = rb_enc_fast_mbclen(s, e, enc);
4171 
4172  if (clen > 1 || (*s & 0x80)) single = 0;
4173  p -= clen;
4174  memcpy(p, s, clen);
4175  s += clen;
4176  }
4177  }
4178  else {
4179  while (s < e) {
4180  int clen = rb_enc_mbclen(s, e, enc);
4181 
4182  if (clen > 1 || (*s & 0x80)) single = 0;
4183  p -= clen;
4184  memcpy(p, s, clen);
4185  s += clen;
4186  }
4187  }
4188  }
4189  STR_SET_LEN(rev, RSTRING_LEN(str));
4190  OBJ_INFECT(rev, str);
4191  if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
4192  if (single) {
4194  }
4195  else {
4197  }
4198  }
4200 
4201  return rev;
4202 }
4203 
4204 
4205 /*
4206  * call-seq:
4207  * str.reverse! -> str
4208  *
4209  * Reverses <i>str</i> in place.
4210  */
4211 
/*
 * Reverses str in place and returns it.
 *  - Length <= 1: only str_modify_keep_cr() is called.
 *  - single_byte_optimizable(str): bytes are swapped pairwise from both ends.
 *  - Otherwise: NOTE(review) the only statement of that branch (listing line
 *    4229) is absent from this extract, so the multibyte path is not visible.
 * NOTE(review): the declarator line (listing line 4213) is also absent.
 */
4212 static VALUE
4214 {
4215  if (RSTRING_LEN(str) > 1) {
4216  if (single_byte_optimizable(str)) {
4217  char *s, *e, c;
4218 
4219  str_modify_keep_cr(str);
4220  s = RSTRING_PTR(str);
4221  e = RSTRING_END(str) - 1; /* last byte */
4222  while (s < e) {
4223  c = *s;
4224  *s++ = *e;
4225  *e-- = c;
4226  }
4227  }
4228  else {
4230  }
4231  }
4232  else {
4233  str_modify_keep_cr(str);
4234  }
4235  return str;
4236 }
4237 
4238 
4239 /*
4240  * call-seq:
4241  * str.include? other_str -> true or false
4242  *
4243  * Returns <code>true</code> if <i>str</i> contains the given string or
4244  * character.
4245  *
4246  * "hello".include? "lo" #=> true
4247  * "hello".include? "ol" #=> false
4248  * "hello".include? ?h #=> true
4249  */
4250 
/*
 * Returns Qtrue unless rb_str_index(str, arg, 0) reports -1, in which case
 * Qfalse is returned.  arg is coerced with StringValue() first.
 * NOTE(review): the declarator line (listing line 4252) is absent from this
 * extract; the body reads str and arg.
 */
4251 static VALUE
4253 {
4254  long i;
4255 
4256  StringValue(arg);
4257  i = rb_str_index(str, arg, 0);
4258 
4259  if (i == -1) return Qfalse;
4260  return Qtrue;
4261 }
4262 
4263 
4264 /*
4265  * call-seq:
4266  * str.to_i(base=10) -> integer
4267  *
4268  * Returns the result of interpreting leading characters in <i>str</i> as an
4269  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
4270  * end of a valid number are ignored. If there is not a valid number at the
4271  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
4272  * exception when <i>base</i> is valid.
4273  *
4274  * "12345".to_i #=> 12345
4275  * "99 red balloons".to_i #=> 99
4276  * "0a".to_i #=> 0
4277  * "0a".to_i(16) #=> 10
4278  * "hello".to_i #=> 0
4279  * "1100101".to_i(2) #=> 101
4280  * "1100101".to_i(8) #=> 294977
4281  * "1100101".to_i(10) #=> 1100101
4282  * "1100101".to_i(16) #=> 17826049
4283  */
4284 
/*
 * Converts str to an integer through rb_str_to_inum(str, base, FALSE).
 * With no argument the base is 10; otherwise the single optional argument
 * is converted with NUM2INT().  Only a negative base is rejected here
 * (ArgumentError); any further validation of the base is left to
 * rb_str_to_inum().
 * NOTE(review): the declarator line (listing line 4286) is absent from this
 * extract; the body reads argc, argv and str.
 */
4285 static VALUE
4287 {
4288  int base;
4289 
4290  if (argc == 0) base = 10;
4291  else {
4292  VALUE b;
4293 
4294  rb_scan_args(argc, argv, "01", &b);
4295  base = NUM2INT(b);
4296  }
4297  if (base < 0) {
4298  rb_raise(rb_eArgError, "invalid radix %d", base);
4299  }
4300  return rb_str_to_inum(str, base, FALSE);
4301 }
4302 
4303 
4304 /*
4305  * call-seq:
4306  * str.to_f -> float
4307  *
4308  * Returns the result of interpreting leading characters in <i>str</i> as a
4309  * floating point number. Extraneous characters past the end of a valid number
4310  * are ignored. If there is not a valid number at the start of <i>str</i>,
4311  * <code>0.0</code> is returned. This method never raises an exception.
4312  *
4313  * "123.45e1".to_f #=> 1234.5
4314  * "45.67 degrees".to_f #=> 45.67
4315  * "thx1138".to_f #=> 0.0
4316  */
4317 
/*
 * Wraps the double produced by rb_str_to_dbl(str, FALSE) with DBL2NUM().
 * NOTE(review): the declarator line (listing line 4319) is absent from this
 * extract.
 */
4318 static VALUE
4320 {
4321  return DBL2NUM(rb_str_to_dbl(str, FALSE));
4322 }
4323 
4324 
4325 /*
4326  * call-seq:
4327  * str.to_s -> str
4328  * str.to_str -> str
4329  *
4330  * Returns the receiver.
4331  */
4332 
/*
 * Returns str itself when its class is exactly rb_cString; for any other
 * class it returns str_duplicate(rb_cString, str) instead.
 * NOTE(review): the declarator line (listing line 4334) is absent from this
 * extract.
 */
4333 static VALUE
4335 {
4336  if (rb_obj_class(str) != rb_cString) {
4337  return str_duplicate(rb_cString, str);
4338  }
4339  return str;
4340 }
4341 
#if 0
/*
 * Compiled out.  Encodes code point c in enc into a scratch buffer and
 * appends those bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
4353 
4354 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
4355 
4356 int
4357 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
4358 {
4359  char buf[CHAR_ESC_LEN + 1];
4360  int l;
4361 
4362 #if SIZEOF_INT > 4
4363  c &= 0xffffffff;
4364 #endif
4365  if (unicode_p) {
4366  if (c < 0x7F && ISPRINT(c)) {
4367  snprintf(buf, CHAR_ESC_LEN, "%c", c);
4368  }
4369  else if (c < 0x10000) {
4370  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
4371  }
4372  else {
4373  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
4374  }
4375  }
4376  else {
4377  if (c < 0x100) {
4378  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
4379  }
4380  else {
4381  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
4382  }
4383  }
4384  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
4385  rb_str_buf_cat(result, buf, l);
4386  return l;
4387 }
4388 
4389 /*
4390  * call-seq:
4391  * str.inspect -> string
4392  *
4393  * Returns a printable version of _str_, surrounded by quote marks,
4394  * with special characters escaped.
4395  *
4396  * str = "hello"
4397  * str[3] = "\b"
4398  * str.inspect #=> "\"hel\\bo\""
4399  */
4400 
/*
 * Builds and returns a quoted, escaped, printable rendering of str.
 *
 * Outline of the visible code:
 *  - The result's encoding is resenc, falling back to the default external
 *    encoding when resenc is NULL and to US-ASCII when that is not
 *    ASCII-compatible.
 *  - For the "UTF-16" / "UTF-32" encodings the leading bytes are inspected
 *    for a byte-order mark to pick the BE/LE variant; without one,
 *    unicode_p is cleared.
 *  - The main loop walks str character by character.  `prev` marks the start
 *    of the pending run of bytes that can be copied verbatim; the run is
 *    flushed whenever something has to be escaped.
 *  - Invalid byte sequences are emitted as \xNN, one escape per byte.
 *  - '"', '\\' and a '#' followed by '$', '@' or '{' get a backslash;
 *    \n \r \t \f \v \b \a \e use their mnemonic escapes; other characters
 *    that are not printable in the result encoding go through
 *    rb_str_buf_cat_escaped_char().
 *  - The result is infected by str.
 *
 * NOTE(review): several listing lines are absent from this extract: the
 * declarator (4402), lines 4407-4408 (`result` and `resenc` are used below
 * but no declaration of them is visible) and line 4464 (part of the '#'
 * condition).
 * NOTE(review): the BOM checks read q[0]..q[1] (UTF-16) and q[0]..q[3]
 * (UTF-32) without a visible length check -- confirm this is safe for
 * strings shorter than 2 / 4 bytes.
 */
4401 VALUE
4403 {
4404  rb_encoding *enc = STR_ENC_GET(str);
4405  const char *p, *pend, *prev;
4406  char buf[CHAR_ESC_LEN + 1];
4409  int unicode_p = rb_enc_unicode_p(enc);
4410  int asciicompat = rb_enc_asciicompat(enc);
4411  static rb_encoding *utf16, *utf32;
4412 
4413  if (!utf16) utf16 = rb_enc_find("UTF-16");
4414  if (!utf32) utf32 = rb_enc_find("UTF-32");
4415  if (resenc == NULL) resenc = rb_default_external_encoding();
4416  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
4417  rb_enc_associate(result, resenc);
4418  str_buf_cat2(result, "\"");
4419 
4420  p = RSTRING_PTR(str); pend = RSTRING_END(str);
4421  prev = p;
4422  if (enc == utf16) {
4423  const unsigned char *q = (const unsigned char *)p;
4424  if (q[0] == 0xFE && q[1] == 0xFF)
4425  enc = rb_enc_find("UTF-16BE");
4426  else if (q[0] == 0xFF && q[1] == 0xFE)
4427  enc = rb_enc_find("UTF-16LE");
4428  else
4429  unicode_p = 0;
4430  }
4431  else if (enc == utf32) {
4432  const unsigned char *q = (const unsigned char *)p;
4433  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4434  enc = rb_enc_find("UTF-32BE");
4435  else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4436  enc = rb_enc_find("UTF-32LE");
4437  else
4438  unicode_p = 0;
4439  }
4440  while (p < pend) {
4441  unsigned int c, cc;
4442  int n;
4443 
4444  n = rb_enc_precise_mbclen(p, pend, enc);
4445  if (!MBCLEN_CHARFOUND_P(n)) {
4446  if (p > prev) str_buf_cat(result, prev, p - prev);
4447  n = rb_enc_mbminlen(enc);
4448  if (pend < p + n)
4449  n = (int)(pend - p);
4450  while (n--) {
4451  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
4452  str_buf_cat(result, buf, strlen(buf));
4453  prev = ++p;
4454  }
4455  continue;
4456  }
4457  n = MBCLEN_CHARFOUND_LEN(n);
4458  c = rb_enc_mbc_to_codepoint(p, pend, enc);
4459  p += n; /* p now points past the current character */
4460  if ((asciicompat || unicode_p) &&
4461  (c == '"'|| c == '\\' ||
4462  (c == '#' &&
4463  p < pend &&
4465  (cc = rb_enc_codepoint(p,pend,enc),
4466  (cc == '$' || cc == '@' || cc == '{'))))) {
4467  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4468  str_buf_cat2(result, "\\");
4469  if (asciicompat || enc == resenc) {
4470  prev = p - n; /* the character itself is copied with the next run */
4471  continue;
4472  }
4473  }
4474  switch (c) {
4475  case '\n': cc = 'n'; break;
4476  case '\r': cc = 'r'; break;
4477  case '\t': cc = 't'; break;
4478  case '\f': cc = 'f'; break;
4479  case '\013': cc = 'v'; break;
4480  case '\010': cc = 'b'; break;
4481  case '\007': cc = 'a'; break;
4482  case 033: cc = 'e'; break;
4483  default: cc = 0; break;
4484  }
4485  if (cc) {
4486  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4487  buf[0] = '\\';
4488  buf[1] = (char)cc;
4489  str_buf_cat(result, buf, 2);
4490  prev = p;
4491  continue;
4492  }
4493  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
4494  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
4495  continue; /* printable: stays in the pending verbatim run */
4496  }
4497  else {
4498  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
4499  rb_str_buf_cat_escaped_char(result, c, unicode_p);
4500  prev = p;
4501  continue;
4502  }
4503  }
4504  if (p > prev) str_buf_cat(result, prev, p - prev);
4505  str_buf_cat2(result, "\"");
4506 
4507  OBJ_INFECT(result, str);
4508  return result;
4509 }
4510 
/* True when p lies before e and the byte at p is '$', '@' or '{'.  The dump
 * code below applies it to the byte following a '#' to decide whether that
 * '#' needs a backslash. */
4511 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
4512 
4513 /*
4514  * call-seq:
4515  * str.dump -> new_str
4516  *
4517  * Produces a version of <i>str</i> with all nonprinting characters replaced by
4518  * <code>\xNN</code> (or, for UTF-8 strings, <code>\u{...}</code>) notation and all special characters escaped.
4519  */
4520 
/*
 * Returns a new, double-quoted, fully escaped rendering of str.
 *
 * Works in two passes over the bytes of str:
 *  1. compute the exact output length `len` (quotes included);
 *  2. allocate the result with that length and write the escaped text.
 * Both passes must classify every byte identically, otherwise the buffer
 * size and the bytes written disagree.
 *
 * Escaping rules visible below: '"' and '\\' get a backslash; '#' gets one
 * only when IS_EVSTR() holds for the following byte; \n \r \t \f \v \b \a \e
 * use their mnemonic escapes; other printable bytes are copied; everything
 * else becomes \xNN, except that for UTF-8 strings a multibyte character is
 * written as \u{...}.  Both passes apply MBCLEN_CHARFOUND_P() to the
 * rb_enc_precise_mbclen() result minus one when making that decision.
 * For encodings that are not ASCII-compatible a .force_encoding("name")
 * suffix is appended and the result is tagged ASCII-8BIT.
 * The result is infected by str.
 *
 * NOTE(review): the declarator line (listing line 4522) and listing line
 * 4648 (one statement just before the return) are absent from this extract.
 */
4521 VALUE
4523 {
4524  rb_encoding *enc = rb_enc_get(str);
4525  long len;
4526  const char *p, *pend;
4527  char *q, *qend;
4528  VALUE result;
4529  int u8 = (enc == rb_utf8_encoding());
4530 
4531  len = 2; /* "" */
4532  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4533  while (p < pend) {
4534  unsigned char c = *p++;
4535  switch (c) {
4536  case '"': case '\\':
4537  case '\n': case '\r':
4538  case '\t': case '\f':
4539  case '\013': case '\010': case '\007': case '\033':
4540  len += 2;
4541  break;
4542 
4543  case '#':
4544  len += IS_EVSTR(p, pend) ? 2 : 1;
4545  break;
4546 
4547  default:
4548  if (ISPRINT(c)) {
4549  len++;
4550  }
4551  else {
4552  if (u8) { /* \u{NN} */
4553  int n = rb_enc_precise_mbclen(p-1, pend, enc);
4554  if (MBCLEN_CHARFOUND_P(n-1)) {
4555  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4556  while (cc >>= 4) len++; /* one per hex digit beyond the first */
4557  len += 5; /* backslash, 'u', '{', first hex digit, '}' */
4558  p += MBCLEN_CHARFOUND_LEN(n)-1;
4559  break;
4560  }
4561  }
4562  len += 4; /* \xNN */
4563  }
4564  break;
4565  }
4566  }
4567  if (!rb_enc_asciicompat(enc)) {
4568  len += 19; /* ".force_encoding('')" */
4569  len += strlen(enc->name);
4570  }
4571 
4572  result = rb_str_new5(str, 0, len);
4573  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
4574  q = RSTRING_PTR(result); qend = q + len + 1; /* +1: room for the terminating NUL written by snprintf */
4575 
4576  *q++ = '"';
4577  while (p < pend) {
4578  unsigned char c = *p++;
4579 
4580  if (c == '"' || c == '\\') {
4581  *q++ = '\\';
4582  *q++ = c;
4583  }
4584  else if (c == '#') {
4585  if (IS_EVSTR(p, pend)) *q++ = '\\';
4586  *q++ = '#';
4587  }
4588  else if (c == '\n') {
4589  *q++ = '\\';
4590  *q++ = 'n';
4591  }
4592  else if (c == '\r') {
4593  *q++ = '\\';
4594  *q++ = 'r';
4595  }
4596  else if (c == '\t') {
4597  *q++ = '\\';
4598  *q++ = 't';
4599  }
4600  else if (c == '\f') {
4601  *q++ = '\\';
4602  *q++ = 'f';
4603  }
4604  else if (c == '\013') {
4605  *q++ = '\\';
4606  *q++ = 'v';
4607  }
4608  else if (c == '\010') {
4609  *q++ = '\\';
4610  *q++ = 'b';
4611  }
4612  else if (c == '\007') {
4613  *q++ = '\\';
4614  *q++ = 'a';
4615  }
4616  else if (c == '\033') {
4617  *q++ = '\\';
4618  *q++ = 'e';
4619  }
4620  else if (ISPRINT(c)) {
4621  *q++ = c;
4622  }
4623  else {
4624  *q++ = '\\';
4625  if (u8) {
4626  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
4627  if (MBCLEN_CHARFOUND_P(n)) {
4628  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
4629  p += n;
4630  snprintf(q, qend-q, "u{%x}", cc);
4631  q += strlen(q);
4632  continue;
4633  }
4634  }
4635  snprintf(q, qend-q, "x%02X", c);
4636  q += 3; /* 'x' plus two hex digits */
4637  }
4638  }
4639  *q++ = '"';
4640  *q = '\0';
4641  if (!rb_enc_asciicompat(enc)) {
4642  snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
4643  enc = rb_ascii8bit_encoding();
4644  }
4645  OBJ_INFECT(result, str);
4646  /* result from dump is ASCII */
4647  rb_enc_associate(result, enc);
4649  return result;
4650 }
4651 
4652 
/*
 * Raises EncodingCompatibilityError (rb_eEncCompatError), naming the
 * encoding, when rb_enc_dummy_p(enc) is true; otherwise does nothing.
 * NOTE(review): the declarator line (listing line 4654) is absent from this
 * extract; the body reads a single variable, enc.
 */
4653 static void
4655 {
4656  if (rb_enc_dummy_p(enc)) {
4657  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
4658  rb_enc_name(enc));
4659  }
4660 }
4661 
4662 /*
4663  * call-seq:
4664  * str.upcase! -> str or nil
4665  *
4666  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4667  * were made.
4668  * Note: case replacement is effective only in ASCII region.
4669  */
4670 
/*
 * Upcases str in place.  Returns str when at least one character was
 * changed, nil otherwise.
 *
 * Two paths:
 *  - single_byte_optimizable(str): every byte in 'a'..'z' is mapped to the
 *    corresponding 'A'..'Z'.
 *  - otherwise: in ASCII-compatible encodings bytes below 0x80 take the same
 *    fast path; all other characters are decoded, tested with
 *    rb_enc_islower() and rewritten in place with rb_enc_toupper().
 *
 * NOTE(review): the declarator line (listing line 4672) and listing line
 * 4681 (a statement right after enc is fetched) are absent from this
 * extract.
 */
4671 static VALUE
4673 {
4674  rb_encoding *enc;
4675  char *s, *send;
4676  int modify = 0;
4677  int n;
4678 
4679  str_modify_keep_cr(str);
4680  enc = STR_ENC_GET(str);
4682  s = RSTRING_PTR(str); send = RSTRING_END(str);
4683  if (single_byte_optimizable(str)) {
4684  while (s < send) {
4685  unsigned int c = *(unsigned char*)s;
4686 
4687  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4688  *s = 'A' + (c - 'a');
4689  modify = 1;
4690  }
4691  s++;
4692  }
4693  }
4694  else {
4695  int ascompat = rb_enc_asciicompat(enc);
4696 
4697  while (s < send) {
4698  unsigned int c;
4699 
4700  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4701  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
4702  *s = 'A' + (c - 'a');
4703  modify = 1;
4704  }
4705  s++;
4706  }
4707  else {
4708  c = rb_enc_codepoint_len(s, send, &n, enc);
4709  if (rb_enc_islower(c, enc)) {
4710  /* assuming toupper returns codepoint with same size */
4711  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4712  modify = 1;
4713  }
4714  s += n;
4715  }
4716  }
4717  }
4718 
4719  if (modify) return str;
4720  return Qnil;
4721 }
4722 
4723 
4724 /*
4725  * call-seq:
4726  * str.upcase -> new_str
4727  *
4728  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
4729  * uppercase counterparts. The operation is locale insensitive---only
4730  * characters ``a'' to ``z'' are affected.
4731  * Note: case replacement is effective only in ASCII region.
4732  *
4733  * "hEllO".upcase #=> "HELLO"
4734  */
4735 
/*
 * Non-destructive form: duplicates str, applies rb_str_upcase_bang() to the
 * duplicate and returns the duplicate (the bang method's own return value is
 * ignored).
 * NOTE(review): the declarator line (listing line 4737) is absent from this
 * extract.
 */
4736 static VALUE
4738 {
4739  str = rb_str_dup(str);
4740  rb_str_upcase_bang(str);
4741  return str;
4742 }
4743 
4744 
4745 /*
4746  * call-seq:
4747  * str.downcase! -> str or nil
4748  *
4749  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4750  * changes were made.
4751  * Note: case replacement is effective only in ASCII region.
4752  */
4753 
/*
 * Downcases str in place.  Returns str when at least one character was
 * changed, nil otherwise.
 *
 * Two paths:
 *  - single_byte_optimizable(str): every byte in 'A'..'Z' is mapped to the
 *    corresponding 'a'..'z'.
 *  - otherwise: in ASCII-compatible encodings bytes below 0x80 take the same
 *    fast path; all other characters are decoded, tested with
 *    rb_enc_isupper() and rewritten in place with rb_enc_tolower().
 *
 * NOTE(review): the declarator line (listing line 4755) and listing line
 * 4763 (a statement right after enc is fetched) are absent from this
 * extract.
 */
4754 static VALUE
4756 {
4757  rb_encoding *enc;
4758  char *s, *send;
4759  int modify = 0;
4760 
4761  str_modify_keep_cr(str);
4762  enc = STR_ENC_GET(str);
4764  s = RSTRING_PTR(str); send = RSTRING_END(str);
4765  if (single_byte_optimizable(str)) {
4766  while (s < send) {
4767  unsigned int c = *(unsigned char*)s;
4768 
4769  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4770  *s = 'a' + (c - 'A');
4771  modify = 1;
4772  }
4773  s++;
4774  }
4775  }
4776  else {
4777  int ascompat = rb_enc_asciicompat(enc);
4778 
4779  while (s < send) {
4780  unsigned int c;
4781  int n;
4782 
4783  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
4784  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
4785  *s = 'a' + (c - 'A');
4786  modify = 1;
4787  }
4788  s++;
4789  }
4790  else {
4791  c = rb_enc_codepoint_len(s, send, &n, enc);
4792  if (rb_enc_isupper(c, enc)) {
4793  /* assuming tolower returns codepoint with same size */
4794  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4795  modify = 1;
4796  }
4797  s += n;
4798  }
4799  }
4800  }
4801 
4802  if (modify) return str;
4803  return Qnil;
4804 }
4805 
4806 
4807 /*
4808  * call-seq:
4809  * str.downcase -> new_str
4810  *
4811  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
4812  * lowercase counterparts. The operation is locale insensitive---only
4813  * characters ``A'' to ``Z'' are affected.
4814  * Note: case replacement is effective only in ASCII region.
4815  *
4816  * "hEllO".downcase #=> "hello"
4817  */
4818 
/*
 * Non-destructive form: duplicates str, applies rb_str_downcase_bang() to
 * the duplicate and returns the duplicate (the bang method's own return
 * value is ignored).
 * NOTE(review): the declarator line (listing line 4820) is absent from this
 * extract.
 */
4819 static VALUE
4821 {
4822  str = rb_str_dup(str);
4823  rb_str_downcase_bang(str);
4824  return str;
4825 }
4826 
4827 
4828 /*
4829  * call-seq:
4830  * str.capitalize! -> str or nil
4831  *
4832  * Modifies <i>str</i> by converting the first character to uppercase and the
4833  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
4834  * Note: case conversion is effective only in ASCII region.
4835  *
4836  * a = "hello"
4837  * a.capitalize! #=> "Hello"
4838  * a #=> "Hello"
4839  * a.capitalize! #=> nil
4840  */
4841 
/*
 * Capitalizes str in place: the first character is upcased when
 * rb_enc_islower() holds for it, every following character is downcased
 * when rb_enc_isupper() holds for it.  Returns str when anything changed and
 * nil otherwise; an empty string returns nil immediately.
 * Converted characters are written back over the original bytes, so the
 * conversions are assumed to keep the encoded length unchanged.
 *
 * NOTE(review): the declarator line (listing line 4843) and listing line
 * 4853 (a statement right after enc is fetched) are absent from this
 * extract.
 */
4842 static VALUE
4844 {
4845  rb_encoding *enc;
4846  char *s, *send;
4847  int modify = 0;
4848  unsigned int c;
4849  int n;
4850 
4851  str_modify_keep_cr(str);
4852  enc = STR_ENC_GET(str);
4854  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4855  s = RSTRING_PTR(str); send = RSTRING_END(str);
4856 
4857  c = rb_enc_codepoint_len(s, send, &n, enc); /* first character */
4858  if (rb_enc_islower(c, enc)) {
4859  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4860  modify = 1;
4861  }
4862  s += n;
4863  while (s < send) { /* remaining characters */
4864  c = rb_enc_codepoint_len(s, send, &n, enc);
4865  if (rb_enc_isupper(c, enc)) {
4866  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4867  modify = 1;
4868  }
4869  s += n;
4870  }
4871 
4872  if (modify) return str;
4873  return Qnil;
4874 }
4875 
4876 
4877 /*
4878  * call-seq:
4879  * str.capitalize -> new_str
4880  *
4881  * Returns a copy of <i>str</i> with the first character converted to uppercase
4882  * and the remainder to lowercase.
4883  * Note: case conversion is effective only in ASCII region.
4884  *
4885  * "hello".capitalize #=> "Hello"
4886  * "HELLO".capitalize #=> "Hello"
4887  * "123ABC".capitalize #=> "123abc"
4888  */
4889 
/*
 * Non-destructive form: duplicates str and returns the duplicate.
 * NOTE(review): the declarator line (listing line 4891) and listing line
 * 4894 -- the statement between the duplication and the return, presumably
 * the call that capitalizes the copy -- are absent from this extract.
 */
4890 static VALUE
4892 {
4893  str = rb_str_dup(str);
4895  return str;
4896 }
4897 
4898 
4899 /*
4900  * call-seq:
4901  * str.swapcase! -> str or nil
4902  *
4903  * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4904  * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4905  * Note: case conversion is effective only in ASCII region.
4906  */
4907 
/*
 * Swaps the case of every character of str in place: characters for which
 * rb_enc_isupper() holds are downcased, those for which rb_enc_islower()
 * holds are upcased.  Returns str when anything changed, nil otherwise.
 *
 * NOTE(review): the declarator line (listing line 4909) and listing line
 * 4918 (a statement right after enc is fetched) are absent from this
 * extract.
 */
4908 static VALUE
4910 {
4911  rb_encoding *enc;
4912  char *s, *send;
4913  int modify = 0;
4914  int n;
4915 
4916  str_modify_keep_cr(str);
4917  enc = STR_ENC_GET(str);
4919  s = RSTRING_PTR(str); send = RSTRING_END(str);
4920  while (s < send) {
4921  unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
4922 
4923  if (rb_enc_isupper(c, enc)) {
4924  /* assuming tolower returns codepoint with same size */
4925  rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4926  modify = 1;
4927  }
4928  else if (rb_enc_islower(c, enc)) {
4929  /* assuming toupper returns codepoint with same size */
4930  rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4931  modify = 1;
4932  }
4933  s += n;
4934  }
4935 
4936  if (modify) return str;
4937  return Qnil;
4938 }
4939 
4940 
4941 /*
4942  * call-seq:
4943  * str.swapcase -> new_str
4944  *
4945  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4946  * to lowercase and lowercase characters converted to uppercase.
4947  * Note: case conversion is effective only in ASCII region.
4948  *
4949  * "Hello".swapcase #=> "hELLO"
4950  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
4951  */
4952 
/*
 * Non-destructive form: duplicates str, applies rb_str_swapcase_bang() to
 * the duplicate and returns the duplicate (the bang method's own return
 * value is ignored).
 * NOTE(review): the declarator line (listing line 4954) is absent from this
 * extract.
 */
4953 static VALUE
4955 {
4956  str = rb_str_dup(str);
4957  rb_str_swapcase_bang(str);
4958  return str;
4959 }
4960 
/* Pointer to unsigned bytes. */
4961 typedef unsigned char *USTR;
4962 
/* Cursor over a transliteration spec, advanced by trnext(). */
4963 struct tr {
4964  int gen; /* non-zero while trnext() is stepping through an "a-b" range */
4965  unsigned int now, max; /* now: code point produced last; max: upper end of the current range */
4966  char *p, *pend; /* read position in the spec and its end */
4967 };
4968 
/*
 * Returns the next code point described by the spec in *t, or
 * (unsigned int)-1 once the spec is exhausted.
 *
 *  - A backslash that is not the last byte is skipped, so the character
 *    after it is taken literally.
 *  - "a-b" (with the '-' not being the last byte) starts a range: the first
 *    call returns a, later calls return the following code points one at a
 *    time, the last one being b (tracked through t->gen / t->now / t->max).
 *  - A range whose start is greater than its end raises ArgumentError.
 *
 * NOTE(review): listing line 4989 is absent from this extract; lines
 * 4990-4991 are the remaining arguments (format string and values) of a call
 * whose first line is missing -- presumably an rb_raise() like the one in
 * the else branch; confirm.
 */
4969 static unsigned int
4970 trnext(struct tr *t, rb_encoding *enc)
4971 {
4972  int n;
4973 
4974  for (;;) {
4975  if (!t->gen) {
4976  if (t->p == t->pend) return -1; /* converts to the all-ones unsigned value */
4977  if (t->p < t->pend - 1 && *t->p == '\\') {
4978  t->p++;
4979  }
4980  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4981  t->p += n;
4982  if (t->p < t->pend - 1 && *t->p == '-') {
4983  t->p++;
4984  if (t->p < t->pend) {
4985  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
4986  t->p += n;
4987  if (t->now > c) {
4988  if (t->now < 0x80 && c < 0x80) {
4990  "invalid range \"%c-%c\" in string transliteration",
4991  t->now, c);
4992  }
4993  else {
4994  rb_raise(rb_eArgError, "invalid range in string transliteration");
4995  }
4996  continue; /* not reached */
4997  }
4998  t->gen = 1;
4999  t->max = c;
5000  }
5001  }
5002  return t->now;
5003  }
5004  else if (++t->now < t->max) {
5005  return t->now;
5006  }
5007  else {
5008  t->gen = 0; /* range finished: hand out its upper end and leave range mode */
5009  return t->max;
5010  }
5011  }
5012 }
5013 
5014 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
5015 
/*
 * Shared implementation of tr! and tr_s!: replaces, in place, the characters
 * of str selected by the pattern src with the corresponding characters of
 * repl.  A non-zero sflag additionally collapses runs of the same translated
 * character.  An empty repl is delegated to rb_str_delete_bang().
 * Returns str when it was modified and Qnil otherwise.
 */
5016 static VALUE
5017 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
5018 {
5019  const unsigned int errc = -1; /* sentinel: end of pattern / "no translation" */
5020  unsigned int trans[256]; /* translation table for codepoints below 256 */
5021  rb_encoding *enc, *e1, *e2;
5022  struct tr trsrc, trrepl;
5023  int cflag = 0; /* set when src starts with '^' (complemented set) */
5024  unsigned int c, c0, last = 0;
5025  int modify = 0, i, l;
5026  char *s, *send;
5027  VALUE hash = 0; /* entries for codepoints >= 256, created on demand */
5028  int singlebyte = single_byte_optimizable(str);
5029  int cr;
5030 
5031 #define CHECK_IF_ASCII(c) \
5032  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
5033  (cr = ENC_CODERANGE_VALID) : 0)
5034 
5035  StringValue(src);
5036  StringValue(repl);
5037  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5038  if (RSTRING_LEN(repl) == 0) {
5039  return rb_str_delete_bang(1, &src, str);
5040  }
5041 
5042  cr = ENC_CODERANGE(str);
5043  e1 = rb_enc_check(str, src);
5044  e2 = rb_enc_check(str, repl);
5045  if (e1 == e2) {
5046  enc = e1;
5047  }
5048  else {
5049  enc = rb_enc_check(src, repl);
5050  }
5051  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
5052  if (RSTRING_LEN(src) > 1 &&
5053  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
5054  trsrc.p + l < trsrc.pend) {
5055  cflag = 1;
5056  trsrc.p += l;
5057  }
5058  trrepl.p = RSTRING_PTR(repl);
5059  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
5060  trsrc.gen = trrepl.gen = 0;
5061  trsrc.now = trrepl.now = 0;
5062  trsrc.max = trrepl.max = 0;
5063 
5064  if (cflag) {
5065  for (i=0; i<256; i++) {
5066  trans[i] = 1; /* placeholder, overwritten with last (or errc) below */
5067  }
5068  while ((c = trnext(&trsrc, enc)) != errc) {
5069  if (c < 256) {
5070  trans[c] = errc; /* listed characters are the ones left alone */
5071  }
5072  else {
5073  if (!hash) hash = rb_hash_new();
5074  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
5075  }
5076  }
5077  while ((c = trnext(&trrepl, enc)) != errc)
5078  /* retrieve last replacer */;
5079  last = trrepl.now;
5080  for (i=0; i<256; i++) {
5081  if (trans[i] != errc) {
5082  trans[i] = last;
5083  }
5084  }
5085  }
5086  else {
5087  unsigned int r;
5088 
5089  for (i=0; i<256; i++) {
5090  trans[i] = errc;
5091  }
5092  while ((c = trnext(&trsrc, enc)) != errc) {
5093  r = trnext(&trrepl, enc);
5094  if (r == errc) r = trrepl.now; /* repl shorter than src: keep using its last character */
5095  if (c < 256) {
5096  trans[c] = r;
5097  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
5098  }
5099  else {
5100  if (!hash) hash = rb_hash_new();
5101  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
5102  }
5103  }
5104  }
5105 
5106  if (cr == ENC_CODERANGE_VALID)
5107  cr = ENC_CODERANGE_7BIT; /* restart from 7bit; CHECK_IF_ASCII() raises it again on any non-ASCII output */
5108  str_modify_keep_cr(str);
5109  s = RSTRING_PTR(str); send = RSTRING_END(str);
5110  if (sflag) {
5111  int clen, tlen;
5112  long offset, max = RSTRING_LEN(str);
5113  unsigned int save = -1; /* last translated codepoint written, used to squeeze repeats */
5114  char *buf = ALLOC_N(char, max), *t = buf;
5115 
5116  while (s < send) {
5117  int may_modify = 0;
5118 
5119  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5120  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5121 
5122  s += clen;
5123  if (c < 256) {
5124  c = trans[c];
5125  }
5126  else if (hash) {
5127  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5128  if (NIL_P(tmp)) {
5129  if (cflag) c = last;
5130  else c = errc;
5131  }
5132  else if (cflag) c = errc;
5133  else c = NUM2INT(tmp);
5134  }
5135  else {
5136  c = cflag ? last : errc; /* no hash: with '^' every codepoint >= 256 is outside the listed set, so it maps to last (same rule as the non-squeeze branch) */
5137  }
5138  if (c != (unsigned int)-1) {
5139  if (save == c) { /* same translated character as the previous one: drop it */
5140  CHECK_IF_ASCII(c);
5141  continue;
5142  }
5143  save = c;
5144  tlen = rb_enc_codelen(c, enc);
5145  modify = 1;
5146  }
5147  else {
5148  save = -1;
5149  c = c0;
5150  if (enc != e1) may_modify = 1;
5151  }
5152  while (t - buf + tlen >= max) { /* grow the output buffer, keeping room for the terminator */
5153  offset = t - buf;
5154  max *= 2;
5155  REALLOC_N(buf, char, max);
5156  t = buf + offset;
5157  }
5158  rb_enc_mbcput(c, t, enc);
5159  if (may_modify && memcmp(s, t, tlen) != 0) {
5160  modify = 1;
5161  }
5162  CHECK_IF_ASCII(c);
5163  t += tlen;
5164  }
5165  if (!STR_EMBED_P(str)) {
5166  xfree(RSTRING(str)->as.heap.ptr);
5167  }
5168  *t = '\0';
5169  RSTRING(str)->as.heap.ptr = buf;
5170  RSTRING(str)->as.heap.len = t - buf;
5171  STR_SET_NOEMBED(str);
5172  RSTRING(str)->as.heap.aux.capa = max;
5173  }
5174  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
5175  while (s < send) { /* byte-for-byte translation directly in the existing buffer */
5176  c = (unsigned char)*s;
5177  if (trans[c] != errc) {
5178  if (!cflag) {
5179  c = trans[c];
5180  *s = c;
5181  modify = 1;
5182  }
5183  else {
5184  *s = last;
5185  modify = 1;
5186  }
5187  }
5188  CHECK_IF_ASCII(c);
5189  s++;
5190  }
5191  }
5192  else {
5193  int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
5194  long offset;
5195  char *buf = ALLOC_N(char, max), *t = buf;
5196 
5197  while (s < send) {
5198  int may_modify = 0;
5199  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
5200  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
5201 
5202  if (c < 256) {
5203  c = trans[c];
5204  }
5205  else if (hash) {
5206  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
5207  if (NIL_P(tmp)) {
5208  if (cflag) c = last;
5209  else c = errc;
5210  }
5211  else if (cflag) c = errc;
5212  else c = NUM2INT(tmp);
5213  }
5214  else {
5215  c = cflag ? last : errc;
5216  }
5217  if (c != errc) {
5218  tlen = rb_enc_codelen(c, enc);
5219  modify = 1;
5220  }
5221  else {
5222  c = c0;
5223  if (enc != e1) may_modify = 1;
5224  }
5225  while (t - buf + tlen >= max) {
5226  offset = t - buf;
5227  max *= 2;
5228  REALLOC_N(buf, char, max);
5229  t = buf + offset;
5230  }
5231  if (s != t) {
5232  rb_enc_mbcput(c, t, enc);
5233  if (may_modify && memcmp(s, t, tlen) != 0) {
5234  modify = 1;
5235  }
5236  }
5237  CHECK_IF_ASCII(c);
5238  s += clen;
5239  t += tlen;
5240  }
5241  if (!STR_EMBED_P(str)) {
5242  xfree(RSTRING(str)->as.heap.ptr);
5243  }
5244  *t = '\0';
5245  RSTRING(str)->as.heap.ptr = buf;
5246  RSTRING(str)->as.heap.len = t - buf;
5247  STR_SET_NOEMBED(str);
5248  RSTRING(str)->as.heap.aux.capa = max;
5249  }
5250 
5251  if (modify) {
5252  if (cr != ENC_CODERANGE_BROKEN)
5253  ENC_CODERANGE_SET(str, cr);
5254  rb_enc_associate(str, enc);
5255  return str;
5256  }
5257  return Qnil;
5258 }
5259 
5260 
5261 /*
5262  * call-seq:
5263  * str.tr!(from_str, to_str) -> str or nil
5264  *
5265  * Translates <i>str</i> in place, using the same rules as
5266  * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
5267  * changes were made.
5268  */
5269 
5270 static VALUE
5271 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
5272 {
5273  return tr_trans(str, src, repl, 0); /* sflag 0: translate without squeezing */
5274 }
5275 
5276 
5277 /*
5278  * call-seq:
5279  * str.tr(from_str, to_str) -> new_str
5280  *
5281  * Returns a copy of <i>str</i> with the characters in <i>from_str</i>
5282  * replaced by the corresponding characters in <i>to_str</i>. If
5283  * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
5284  * character in order to maintain the correspondence.
5285  *
5286  * "hello".tr('el', 'ip') #=> "hippo"
5287  * "hello".tr('aeiou', '*') #=> "h*ll*"
5288  *
5289  * Both strings may use the c1-c2 notation to denote ranges of characters,
5290  * and <i>from_str</i> may start with a <code>^</code>, which denotes all
5291  * characters except those listed.
5292  *
5293  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
5294  * "hello".tr('^aeiou', '*') #=> "*e**o"
5295  */
5296 
5297 static VALUE
5298 rb_str_tr(VALUE str, VALUE src, VALUE repl)
5299 {
5300  str = rb_str_dup(str); /* translate a duplicate, never the receiver */
5301  tr_trans(str, src, repl, 0); /* the str-or-nil result is ignored: the copy is returned even if unchanged */
5302  return str;
5303 }
5304 
/* 256 per-codepoint flags plus one extra slot (index 256) for the "all sets negated" flag. */
5305 #define TR_TABLE_SIZE 257
/*
 * Folds one tr-style character-set string into the running selection.
 * stable[0..255] flags the codepoints below 256 that are selected by every
 * set processed so far (a non-zero first resets the table); stable[256]
 * stays set only while every set so far started with '^'.  Codepoints >= 256
 * are collected in a hash published through *tablep (plain set) or *ctablep
 * (negated set); when a previous hash exists, only keys already present in
 * it are kept.
 */
5306 static void
5307 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
5308  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
5309 {
5310  const unsigned int errc = -1; /* end-of-pattern sentinel returned by trnext() */
5311  char buf[256]; /* membership of codepoints < 256 in this one set */
5312  struct tr tr;
5313  unsigned int c;
5314  VALUE table = 0, ptable = 0;
5315  int i, l, cflag = 0;
5316 
5317  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
5318  tr.gen = tr.now = tr.max = 0;
5319 
5320  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
5321  cflag = 1;
5322  tr.p += l;
5323  }
5324  if (first) {
5325  for (i=0; i<256; i++) {
5326  stable[i] = 1;
5327  }
5328  stable[256] = cflag;
5329  }
5330  else if (stable[256] && !cflag) {
5331  stable[256] = 0;
5332  }
5333  for (i=0; i<256; i++) {
5334  buf[i] = cflag; /* negated set: everything is a member until listed */
5335  }
5336 
5337  while ((c = trnext(&tr, enc)) != errc) {
5338  if (c < 256) {
5339  buf[c & 0xff] = !cflag;
5340  }
5341  else {
5342  VALUE key = UINT2NUM(c);
5343 
5344  if (!table) {
5345  table = rb_hash_new();
5346  if (cflag) {
5347  ptable = *ctablep;
5348  *ctablep = table;
5349  }
5350  else {
5351  ptable = *tablep;
5352  *tablep = table;
5353  }
5354  }
5355  if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
5356  rb_hash_aset(table, key, Qtrue);
5357  }
5358  }
5359  }
5360  for (i=0; i<256; i++) {
5361  stable[i] = stable[i] && buf[i]; /* intersect with the sets seen before */
5362  }
5363 }
5364 
5365 
/*
 * Tells whether codepoint c is selected by the tables built with
 * tr_setup_table(): table[c] for c < 256; otherwise the del and nodel
 * hashes are consulted, falling back to the flag in table[256].
 */
5366 static int
5367 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
5368 {
5369  if (c < 256) {
5370  return table[c] != 0;
5371  }
5372  else {
5373  VALUE v = UINT2NUM(c);
5374 
5375  if (del) {
5376  if (!NIL_P(rb_hash_lookup(del, v)) &&
5377  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
5378  return TRUE;
5379  }
5380  }
5381  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
5382  return FALSE;
5383  }
5384  return table[256] ? TRUE : FALSE;
5385  }
5386 }
5387 
5388 /*
5389  * call-seq:
5390  * str.delete!([other_str]+) -> str or nil
5391  *
5392  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
5393  * <code>nil</code> if <i>str</i> was not modified.
5394  */
5395 
5396 static VALUE
5397 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
5398 {
5399  char squeez[TR_TABLE_SIZE];
5400  rb_encoding *enc = 0;
5401  char *s, *send, *t;
5402  VALUE del = 0, nodel = 0;
5403  int modify = 0;
5404  int i, ascompat, cr;
5405 
5406  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
5407  if (argc < 1) {
5408  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5409  }
5410  for (i=0; i<argc; i++) { /* intersect all argument sets into squeez/del/nodel */
5411  VALUE s = argv[i];
5412 
5413  StringValue(s);
5414  enc = rb_enc_check(str, s);
5415  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5416  }
5417 
5418  str_modify_keep_cr(str);
5419  ascompat = rb_enc_asciicompat(enc);
5420  s = t = RSTRING_PTR(str); /* s reads, t writes: kept characters are compacted in place */
5421  send = RSTRING_END(str);
5422  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5423  while (s < send) {
5424  unsigned int c;
5425  int clen;
5426 
5427  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5428  if (squeez[c]) {
5429  modify = 1;
5430  }
5431  else {
5432  if (t != s) *t = c;
5433  t++;
5434  }
5435  s++;
5436  }
5437  else {
5438  c = rb_enc_codepoint_len(s, send, &clen, enc);
5439 
5440  if (tr_find(c, squeez, del, nodel)) {
5441  modify = 1;
5442  }
5443  else {
5444  if (t != s) rb_enc_mbcput(c, t, enc);
5445  t += clen;
5446  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; /* a kept character from this branch ends the 7bit assumption */
5447  }
5448  s += clen;
5449  }
5450  }
5451  *t = '\0';
5452  STR_SET_LEN(str, t - RSTRING_PTR(str));
5453  ENC_CODERANGE_SET(str, cr);
5454 
5455  if (modify) return str;
5456  return Qnil;
5457 }
5458 
5459 
5460 /*
5461  * call-seq:
5462  * str.delete([other_str]+) -> new_str
5463  *
5464  * Returns a copy of <i>str</i> with all characters in the intersection of its
5465  * arguments deleted. Uses the same rules for building the set of characters as
5466  * <code>String#count</code>.
5467  *
5468  * "hello".delete "l","lo" #=> "heo"
5469  * "hello".delete "lo" #=> "he"
5470  * "hello".delete "aeiou", "^e" #=> "hell"
5471  * "hello".delete "ej-m" #=> "ho"
5472  */
5473 
5474 static VALUE
5475 rb_str_delete(int argc, VALUE *argv, VALUE str)
5476 {
5477  str = rb_str_dup(str); /* delete from a duplicate, never the receiver */
5478  rb_str_delete_bang(argc, argv, str); /* the str-or-nil result is ignored: the copy is returned either way */
5479  return str;
5480 }
5481 
5482 
5483 /*
5484  * call-seq:
5485  * str.squeeze!([other_str]*) -> str or nil
5486  *
5487  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
5488  * <code>nil</code> if no changes were made.
5489  */
5490 
5491 static VALUE
5492 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
5493 {
5494  char squeez[TR_TABLE_SIZE];
5495  rb_encoding *enc = 0;
5496  VALUE del = 0, nodel = 0;
5497  char *s, *send, *t;
5498  int i, modify = 0;
5499  int ascompat, singlebyte = single_byte_optimizable(str);
5500  unsigned int save; /* last character written to the output */
5501 
5502  if (argc == 0) {
5503  enc = STR_ENC_GET(str);
5504  }
5505  else {
5506  for (i=0; i<argc; i++) {
5507  VALUE s = argv[i];
5508 
5509  StringValue(s);
5510  enc = rb_enc_check(str, s);
5511  if (singlebyte && !single_byte_optimizable(s))
5512  singlebyte = 0;
5513  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
5514  }
5515  }
5516 
5517  str_modify_keep_cr(str);
5518  s = t = RSTRING_PTR(str); /* s reads, t writes: the string is compacted in place */
5519  if (!s || RSTRING_LEN(str) == 0) return Qnil;
5520  send = RSTRING_END(str);
5521  save = -1;
5522  ascompat = rb_enc_asciicompat(enc);
5523 
5524  if (singlebyte) {
5525  while (s < send) {
5526  unsigned int c = *(unsigned char*)s++;
5527  if (c != save || (argc > 0 && !squeez[c])) { /* keep unless it repeats the previous char and is squeezable */
5528  *t++ = save = c;
5529  }
5530  }
5531  } else {
5532  while (s < send) {
5533  unsigned int c;
5534  int clen;
5535 
5536  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5537  if (c != save || (argc > 0 && !squeez[c])) {
5538  *t++ = save = c;
5539  }
5540  s++;
5541  }
5542  else {
5543  c = rb_enc_codepoint_len(s, send, &clen, enc);
5544 
5545  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
5546  if (t != s) rb_enc_mbcput(c, t, enc);
5547  save = c;
5548  t += clen;
5549  }
5550  s += clen;
5551  }
5552  }
5553  }
5554 
5555  *t = '\0';
5556  if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { /* a shorter result is the only sign that something was squeezed */
5557  STR_SET_LEN(str, t - RSTRING_PTR(str));
5558  modify = 1;
5559  }
5560 
5561  if (modify) return str;
5562  return Qnil;
5563 }
5564 
5565 
5566 /*
5567  * call-seq:
5568  * str.squeeze([other_str]*) -> new_str
5569  *
5570  * Builds a set of characters from the <i>other_str</i> parameter(s) using the
5571  * procedure described for <code>String#count</code>. Returns a new string
5572  * where runs of the same character that occur in this set are replaced by a
5573  * single character. If no arguments are given, all runs of identical
5574  * characters are replaced by a single character.
5575  *
5576  * "yellow moon".squeeze #=> "yelow mon"
5577  * " now is the".squeeze(" ") #=> " now is the"
5578  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
5579  */
5580 
5581 static VALUE
5582 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
5583 {
5584  str = rb_str_dup(str); /* squeeze a duplicate, never the receiver */
5585  rb_str_squeeze_bang(argc, argv, str); /* the str-or-nil result is ignored: the copy is returned either way */
5586  return str;
5587 }
5588 
5589 
5590 /*
5591  * call-seq:
5592  * str.tr_s!(from_str, to_str) -> str or nil
5593  *
5594  * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
5595  * returning <i>str</i>, or <code>nil</code> if no changes were made.
5596  */
5597 
5598 static VALUE
5599 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
5600 {
5601  return tr_trans(str, src, repl, 1); /* sflag 1: translate and squeeze repeated results */
5602 }
5603 
5604 
5605 /*
5606  * call-seq:
5607  * str.tr_s(from_str, to_str) -> new_str
5608  *
5609  * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
5610  * then removes duplicate characters in regions that were affected by the
5611  * translation.
5612  *
5613  * "hello".tr_s('l', 'r') #=> "hero"
5614  * "hello".tr_s('el', '*') #=> "h*o"
5615  * "hello".tr_s('el', 'hx') #=> "hhxo"
5616  */
5617 
5618 static VALUE
5619 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
5620 {
5621  str = rb_str_dup(str); /* translate a duplicate, never the receiver */
5622  tr_trans(str, src, repl, 1); /* sflag 1 selects the squeezing variant; the str-or-nil result is ignored */
5623  return str;
5624 }
5625 
5626 
5627 /*
5628  * call-seq:
5629  * str.count([other_str]+) -> fixnum
5630  *
5631  * Each <i>other_str</i> parameter defines a set of characters to count. The
5632  * intersection of these sets defines the characters to count in
5633  * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
5634  * negated. The sequence c1-c2 means all characters between c1 and c2.
5635  *
5636  * a = "hello world"
5637  * a.count "lo" #=> 5
5638  * a.count "lo", "o" #=> 2
5639  * a.count "hello", "^l" #=> 4
5640  * a.count "ej-m" #=> 4
5641  */
5642 
5643 static VALUE
5644 rb_str_count(int argc, VALUE *argv, VALUE str)
5645 {
5646  char table[TR_TABLE_SIZE];
5647  rb_encoding *enc = 0;
5648  VALUE del = 0, nodel = 0;
5649  char *s, *send;
5650  int i;
5651  int ascompat;
5652 
5653  if (argc < 1) {
5654  rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
5655  }
5656  for (i=0; i<argc; i++) {
5657  VALUE tstr = argv[i];
5658  unsigned char c;
5659 
5660  StringValue(tstr);
5661  enc = rb_enc_check(str, tstr);
5662  if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
5663  (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { /* fast path: a single one-byte (< 0x80) character to count */
5664  int n = 0;
5665 
5666  s = RSTRING_PTR(str);
5667  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5668  send = RSTRING_END(str);
5669  while (s < send) {
5670  if (*(unsigned char*)s++ == c) n++;
5671  }
5672  return INT2NUM(n);
5673  }
5674  tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
5675  }
5676 
5677  s = RSTRING_PTR(str);
5678  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
5679  send = RSTRING_END(str);
5680  ascompat = rb_enc_asciicompat(enc);
5681  i = 0; /* i is reused as the match counter from here on */
5682  while (s < send) {
5683  unsigned int c;
5684 
5685  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
5686  if (table[c]) {
5687  i++;
5688  }
5689  s++;
5690  }
5691  else {
5692  int clen;
5693  c = rb_enc_codepoint_len(s, send, &clen, enc);
5694  if (tr_find(c, table, del, nodel)) {
5695  i++;
5696  }
5697  s += clen;
5698  }
5699  }
5700 
5701  return INT2NUM(i);
5702 }
5703 
/* Byte-indexed lookup table: the only non-zero entries are bytes 9-13 and 32. */
5704 static const char isspacetable[256] = {
5705  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
5706  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5707  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5708  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5709  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5710  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5711  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5712  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5713  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5714  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5715  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5716  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5717  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5718  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5719  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5720  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5721 };
5722 
/* The cast to unsigned char keeps the index inside 0..255 even for negative char values. */
5723 #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
5724 
5725 /*
5726  * call-seq:
5727  * str.split(pattern=$;, [limit]) -> anArray
5728  *
5729  * Divides <i>str</i> into substrings based on a delimiter, returning an array
5730  * of these substrings.
5731  *
5732  * If <i>pattern</i> is a <code>String</code>, then its contents are used as
5733  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
5734  * space, <i>str</i> is split on whitespace, with leading whitespace and runs
5735  * of contiguous whitespace characters ignored.
5736  *
5737  * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
5738  * pattern matches. Whenever the pattern matches a zero-length string,
5739  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
5740  * groups, the respective matches will be returned in the array as well.
5741  *
5742  * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
5743  * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
5744  * split on whitespace as if ` ' were specified.
5745  *
5746  * If the <i>limit</i> parameter is omitted, trailing null fields are
5747  * suppressed. If <i>limit</i> is a positive number, at most that number of
5748  * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
5749  * string is returned as the only entry in an array). If negative, there is no
5750  * limit to the number of fields returned, and trailing null fields are not
5751  * suppressed.
5752  *
5753  * " now's the time".split #=> ["now's", "the", "time"]
5754  * " now's the time".split(' ') #=> ["now's", "the", "time"]
5755  * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
5756  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
5757  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
5758  * "hello".split(//, 3) #=> ["h", "e", "llo"]
5759  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
5760  *
5761  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
5762  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
5763  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
5764  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
5765  */
5766 
5767 static VALUE
5768 rb_str_split_m(int argc, VALUE *argv, VALUE str)
5769 {
5770  rb_encoding *enc;
5771  VALUE spat;
5772  VALUE limit;
5773  enum {awk, string, regexp} split_type;
5774  long beg, end, i = 0;
5775  int lim = 0;
5776  VALUE result, tmp;
5777 
5778  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
5779  lim = NUM2INT(limit);
5780  if (lim <= 0) limit = Qnil; /* non-positive limit: no cap on the number of fields */
5781  else if (lim == 1) {
5782  if (RSTRING_LEN(str) == 0)
5783  return rb_ary_new2(0);
5784  return rb_ary_new3(1, str);
5785  }
5786  i = 1;
5787  }
5788 
5789  enc = STR_ENC_GET(str);
5790  if (NIL_P(spat)) {
5791  if (!NIL_P(rb_fs)) {
5792  spat = rb_fs;
5793  goto fs_set;
5794  }
5795  split_type = awk;
5796  }
5797  else {
5798  fs_set:
5799  if (TYPE(spat) == T_STRING) {
5800  rb_encoding *enc2 = STR_ENC_GET(spat);
5801 
5802  split_type = string;
5803  if (RSTRING_LEN(spat) == 0) {
5804  /* Special case - split into chars */
5805  spat = rb_reg_regcomp(spat);
5806  split_type = regexp;
5807  }
5808  else if (rb_enc_asciicompat(enc2) == 1) {
5809  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
5810  split_type = awk;
5811  }
5812  }
5813  else {
5814  int l;
5815  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
5816  RSTRING_LEN(spat) == l) {
5817  split_type = awk;
5818  }
5819  }
5820  }
5821  else {
5822  spat = get_pat(spat, 1);
5823  split_type = regexp;
5824  }
5825  }
5826 
5827  result = rb_ary_new();
5828  beg = 0;
5829  if (split_type == awk) { /* whitespace splitting: leading blanks and runs of blanks are skipped */
5830  char *ptr = RSTRING_PTR(str);
5831  char *eptr = RSTRING_END(str);
5832  char *bptr = ptr;
5833  int skip = 1; /* non-zero while skipping whitespace between fields */
5834  unsigned int c;
5835 
5836  end = beg;
5837  if (is_ascii_string(str)) {
5838  while (ptr < eptr) {
5839  c = (unsigned char)*ptr++;
5840  if (skip) {
5841  if (ascii_isspace(c)) {
5842  beg = ptr - bptr;
5843  }
5844  else {
5845  end = ptr - bptr;
5846  skip = 0;
5847  if (!NIL_P(limit) && lim <= i) break;
5848  }
5849  }
5850  else if (ascii_isspace(c)) {
5851  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5852  skip = 1;
5853  beg = ptr - bptr;
5854  if (!NIL_P(limit)) ++i;
5855  }
5856  else {
5857  end = ptr - bptr;
5858  }
5859  }
5860  }
5861  else {
5862  while (ptr < eptr) {
5863  int n;
5864 
5865  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
5866  ptr += n;
5867  if (skip) {
5868  if (rb_isspace(c)) {
5869  beg = ptr - bptr;
5870  }
5871  else {
5872  end = ptr - bptr;
5873  skip = 0;
5874  if (!NIL_P(limit) && lim <= i) break;
5875  }
5876  }
5877  else if (rb_isspace(c)) {
5878  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5879  skip = 1;
5880  beg = ptr - bptr;
5881  if (!NIL_P(limit)) ++i;
5882  }
5883  else {
5884  end = ptr - bptr;
5885  }
5886  }
5887  }
5888  }
5889  else if (split_type == string) { /* literal delimiter: located with rb_memsearch() */
5890  char *ptr = RSTRING_PTR(str);
5891  char *temp = ptr;
5892  char *eptr = RSTRING_END(str);
5893  char *sptr = RSTRING_PTR(spat);
5894  long slen = RSTRING_LEN(spat);
5895 
5896  if (is_broken_string(str)) {
5897  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
5898  }
5899  if (is_broken_string(spat)) {
5900  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
5901  }
5902  enc = rb_enc_check(str, spat);
5903  while (ptr < eptr &&
5904  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
5905  /* Check we are at the start of a char */
5906  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
5907  if (t != ptr + end) {
5908  ptr = t;
5909  continue;
5910  }
5911  rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
5912  ptr += end + slen;
5913  if (!NIL_P(limit) && lim <= ++i) break;
5914  }
5915  beg = ptr - temp;
5916  }
5917  else { /* regexp delimiter */
5918  char *ptr = RSTRING_PTR(str);
5919  long len = RSTRING_LEN(str);
5920  long start = beg;
5921  long idx;
5922  int last_null = 0; /* set after an empty match so the retry emits one character */
5923  struct re_registers *regs;
5924 
5925  while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
5926  regs = RMATCH_REGS(rb_backref_get());
5927  if (start == end && BEG(0) == END(0)) {
5928  if (!ptr) {
5929  rb_ary_push(result, str_new_empty(str));
5930  break;
5931  }
5932  else if (last_null == 1) {
5933  rb_ary_push(result, rb_str_subseq(str, beg,
5934  rb_enc_fast_mbclen(ptr+beg,
5935  ptr+len,
5936  enc)));
5937  beg = start;
5938  }
5939  else {
5940  if (ptr+start == ptr+len)
5941  start++;
5942  else
5943  start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
5944  last_null = 1;
5945  continue;
5946  }
5947  }
5948  else {
5949  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5950  beg = start = END(0);
5951  }
5952  last_null = 0;
5953 
5954  for (idx=1; idx < regs->num_regs; idx++) { /* captured groups are appended to the result as well */
5955  if (BEG(idx) == -1) continue;
5956  if (BEG(idx) == END(idx))
5957  tmp = str_new_empty(str);
5958  else
5959  tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5960  rb_ary_push(result, tmp);
5961  }
5962  if (!NIL_P(limit) && lim <= ++i) break;
5963  }
5964  }
5965  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { /* push whatever follows the last delimiter */
5966  if (RSTRING_LEN(str) == beg)
5967  tmp = str_new_empty(str);
5968  else
5969  tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5970  rb_ary_push(result, tmp);
5971  }
5972  if (NIL_P(limit) && lim == 0) { /* no limit given: strip trailing empty fields */
5973  long len;
5974  while ((len = RARRAY_LEN(result)) > 0 &&
5975  (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
5976  rb_ary_pop(result);
5977  }
5978 
5979  return result;
5980 }
5981 
/*
 * C-level convenience wrapper: splits str on the NUL-terminated separator
 * sep0 by calling rb_str_split_m() with that single argument (no limit).
 */
5982 VALUE
5983 rb_str_split(VALUE str, const char *sep0)
5984 {
5985  VALUE sep;
5986 
5987  StringValue(str);
5988  sep = rb_str_new2(sep0);
5989  return rb_str_split_m(1, &sep, str);
5990 }
5991 
5992 
5993 /*
5994  * call-seq:
5995  * str.each_line(separator=$/) {|substr| block } -> str
5996  * str.each_line(separator=$/) -> an_enumerator
5997  *
5998  * str.lines(separator=$/) {|substr| block } -> str
5999  * str.lines(separator=$/) -> an_enumerator
6000  *
6001  * Splits <i>str</i> using the supplied parameter as the record separator
6002  * (<code>$/</code> by default), passing each substring in turn to the supplied
6003  * block. If a zero-length record separator is supplied, the string is split
6004  * into paragraphs delimited by multiple successive newlines.
6005  *
6006  * If no block is given, an enumerator is returned instead.
6007  *
6008  * print "Example one\n"
6009  * "hello\nworld".each_line {|s| p s}
6010  * print "Example two\n"
6011  * "hello\nworld".each_line('l') {|s| p s}
6012  * print "Example three\n"
6013  * "hello\n\n\nworld".each_line('') {|s| p s}
6014  *
6015  * <em>produces:</em>
6016  *
6017  * Example one
6018  * "hello\n"
6019  * "world"
6020  * Example two
6021  * "hel"
6022  * "l"
6023  * "o\nworl"
6024  * "d"
6025  * Example three
6026  * "hello\n\n\n"
6027  * "world"
6028  */
6029 
6030 static VALUE
6031 rb_str_each_line(int argc, VALUE *argv, VALUE str)
6032 {
6033  rb_encoding *enc;
6034  VALUE rs;
6035  unsigned int newline;
6036  const char *p, *pend, *s, *ptr;
6037  long len, rslen;
6038  VALUE line;
6039  int n;
6040  VALUE orig = str; /* the receiver is what gets returned, not the working copy made below */
6041 
6042  if (argc == 0) {
6043  rs = rb_rs;
6044  }
6045  else {
6046  rb_scan_args(argc, argv, "01", &rs);
6047  }
6048  RETURN_ENUMERATOR(str, argc, argv);
6049  if (NIL_P(rs)) { /* nil separator: the whole string is yielded as one line */
6050  rb_yield(str);
6051  return orig;
6052  }
6053  str = rb_str_new4(str);
6054  ptr = p = s = RSTRING_PTR(str);
6055  pend = p + RSTRING_LEN(str);
6056  len = RSTRING_LEN(str);
6057  StringValue(rs);
6058  if (rs == rb_default_rs) { /* default separator: scan for '\n' bytes with memchr() */
6059  enc = rb_enc_get(str);
6060  while (p < pend) {
6061  char *p0;
6062 
6063  p = memchr(p, '\n', pend - p);
6064  if (!p) break;
6065  p0 = rb_enc_left_char_head(s, p, pend, enc);
6066  if (!rb_enc_is_newline(p0, pend, enc)) { /* the byte found is not a newline character in this encoding */
6067  p++;
6068  continue;
6069  }
6070  p = p0 + rb_enc_mbclen(p0, pend, enc);
6071  line = rb_str_new5(str, s, p - s);
6072  OBJ_INFECT(line, str);
6073  rb_enc_cr_str_copy_for_substr(line, str);
6074  rb_yield(line);
6075  str_mod_check(str, ptr, len);
6076  s = p;
6077  }
6078  goto finish;
6079  }
6080 
6081  enc = rb_enc_check(str, rs);
6082  rslen = RSTRING_LEN(rs);
6083  if (rslen == 0) { /* empty separator: paragraph mode, split on runs of newlines */
6084  newline = '\n';
6085  }
6086  else {
6087  newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); /* first character of the separator */
6088  }
6089 
6090  while (p < pend) {
6091  unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
6092 
6093  again:
6094  if (rslen == 0 && c == newline) {
6095  p += n;
6096  if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
6097  goto again;
6098  }
6099  while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
6100  p += n;
6101  }
6102  p -= n;
6103  }
6104  if (c == newline &&
6105  (rslen <= 1 ||
6106  (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
6107  line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
6108  OBJ_INFECT(line, str);
6109  rb_enc_cr_str_copy_for_substr(line, str);
6110  rb_yield(line);
6111  str_mod_check(str, ptr, len);
6112  s = p + (rslen ? rslen : n);
6113  }
6114  p += n;
6115  }
6116 
6117  finish:
6118  if (s != pend) { /* yield the remainder that has no trailing separator */
6119  line = rb_str_new5(str, s, pend - s);
6120  OBJ_INFECT(line, str);
6121  rb_enc_cr_str_copy_for_substr(line, str);
6122  rb_yield(line);
6123  }
6124 
6125  return orig;
6126 }
6127 
6128 
6129 /*
6130  * call-seq:
6131  * str.bytes {|fixnum| block } -> str
6132  * str.bytes -> an_enumerator
6133  *
6134  * str.each_byte {|fixnum| block } -> str
6135  * str.each_byte -> an_enumerator
6136  *
6137  * Passes each byte in <i>str</i> to the given block, or returns
6138  * an enumerator if no block is given.
6139  *
6140  * "hello".each_byte {|c| print c, ' ' }
6141  *
6142  * <em>produces:</em>
6143  *
6144  * 104 101 108 108 111
6145  */
6146 
6147 static VALUE
6148 rb_str_each_byte(VALUE str)
6149 {
6150  long i;
6151 
6152  RETURN_ENUMERATOR(str, 0, 0);
6153  for (i=0; i<RSTRING_LEN(str); i++) { /* length and pointer are re-read on every iteration */
6154  rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); /* mask so the yielded value is 0..255 */
6155  }
6156  return str;
6157 }
6158 
6159 
6160 /*
6161  * call-seq:
6162  * str.chars {|cstr| block } -> str
6163  * str.chars -> an_enumerator
6164  *
6165  * str.each_char {|cstr| block } -> str
6166  * str.each_char -> an_enumerator
6167  *
6168  * Passes each character in <i>str</i> to the given block, or returns
6169  * an enumerator if no block is given.
6170  *
6171  * "hello".each_char {|c| print c, ' ' }
6172  *
6173  * <em>produces:</em>
6174  *
6175  * h e l l o
6176  */
6177 
6178 static VALUE
6179 rb_str_each_char(VALUE str)
6180 {
6181  VALUE orig = str; /* the receiver is returned, not the working copy made below */
6182  long i, len, n;
6183  const char *ptr;
6184  rb_encoding *enc;
6185 
6186  RETURN_ENUMERATOR(str, 0, 0);
6187  str = rb_str_new4(str);
6188  ptr = RSTRING_PTR(str);
6189  len = RSTRING_LEN(str);
6190  enc = rb_enc_get(str);
6191  switch (ENC_CODERANGE(str)) {
6192  case ENC_CODERANGE_VALID:
6193  case ENC_CODERANGE_7BIT:
6194  for (i = 0; i < len; i += n) { /* these two code ranges use rb_enc_fast_mbclen() */
6195  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
6196  rb_yield(rb_str_subseq(str, i, n));
6197  }
6198  break;
6199  default:
6200  for (i = 0; i < len; i += n) { /* any other code range uses rb_enc_mbclen() */
6201  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
6202  rb_yield(rb_str_subseq(str, i, n));
6203  }
6204  }
6205  return orig;
6206 }
6207 
6208 /*
6209  * call-seq:
6210  * str.codepoints {|integer| block } -> str
6211  * str.codepoints -> an_enumerator
6212  *
6213  * str.each_codepoint {|integer| block } -> str
6214  * str.each_codepoint -> an_enumerator
6215  *
6216  * Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
6217  * also known as a <i>codepoint</i> when applied to Unicode strings to the
6218  * given block.
6219  *
6220  * If no block is given, an enumerator is returned instead.
6221  *
6222  * "hello\u0639".each_codepoint {|c| print c, ' ' }
6223  *
6224  * <em>produces:</em>
6225  *
6226  * 104 101 108 108 111 1593
6227  */
6228 
6229 static VALUE
6231 {
6232  VALUE orig = str;
6233  int n;
6234  unsigned int c;
6235  const char *ptr, *end;
6236  rb_encoding *enc;
6237 
6238  if (single_byte_optimizable(str)) return rb_str_each_byte(str);
6239  RETURN_ENUMERATOR(str, 0, 0);
6240  str = rb_str_new4(str);
6241  ptr = RSTRING_PTR(str);
6242  end = RSTRING_END(str);
6243  enc = STR_ENC_GET(str);
6244  while (ptr < end) {
6245  c = rb_enc_codepoint_len(ptr, end, &n, enc);
6246  rb_yield(UINT2NUM(c));
6247  ptr += n;
6248  }
6249  return orig;
6250 }
6251 
6252 static long
6254 {
6255  rb_encoding *enc = STR_ENC_GET(str);
6256  const char *p, *p2, *beg, *end;
6257 
6258  beg = RSTRING_PTR(str);
6259  end = beg + RSTRING_LEN(str);
6260  if (beg > end) return 0;
6261  p = rb_enc_prev_char(beg, end, end, enc);
6262  if (!p) return 0;
6263  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
6264  p2 = rb_enc_prev_char(beg, p, end, enc);
6265  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
6266  }
6267  return p - beg;
6268 }
6269 
6270 /*
6271  * call-seq:
6272  * str.chop! -> str or nil
6273  *
6274  * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
6275  * or <code>nil</code> if <i>str</i> is the empty string. See also
6276  * <code>String#chomp!</code>.
6277  */
6278 
6279 static VALUE
6281 {
6282  str_modify_keep_cr(str);
6283  if (RSTRING_LEN(str) > 0) {
6284  long len;
6285  len = chopped_length(str);
6286  STR_SET_LEN(str, len);
6287  RSTRING_PTR(str)[len] = '\0';
6288  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6289  ENC_CODERANGE_CLEAR(str);
6290  }
6291  return str;
6292  }
6293  return Qnil;
6294 }
6295 
6296 
6297 /*
6298  * call-seq:
6299  * str.chop -> new_str
6300  *
6301  * Returns a new <code>String</code> with the last character removed. If the
6302  * string ends with <code>\r\n</code>, both characters are removed. Applying
6303  * <code>chop</code> to an empty string returns an empty
6304  * string. <code>String#chomp</code> is often a safer alternative, as it leaves
6305  * the string unchanged if it doesn't end in a record separator.
6306  *
6307  * "string\r\n".chop #=> "string"
6308  * "string\n\r".chop #=> "string\n"
6309  * "string\n".chop #=> "string"
6310  * "string".chop #=> "strin"
6311  * "x".chop.chop #=> ""
6312  */
6313 
6314 static VALUE
6316 {
6317  VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
6318  rb_enc_cr_str_copy_for_substr(str2, str);
6319  OBJ_INFECT(str2, str);
6320  return str2;
6321 }
6322 
6323 
6324 /*
6325  * call-seq:
6326  * str.chomp!(separator=$/) -> str or nil
6327  *
6328  * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
6329  * returning <i>str</i>, or <code>nil</code> if no modifications were made.
6330  */
6331 
6332 static VALUE
6334 {
6335  rb_encoding *enc;
6336  VALUE rs;
6337  int newline;
6338  char *p, *pp, *e;
6339  long len, rslen;
6340 
6341  str_modify_keep_cr(str);
6342  len = RSTRING_LEN(str);
6343  if (len == 0) return Qnil;
6344  p = RSTRING_PTR(str);
6345  e = p + len;
6346  if (argc == 0) {
6347  rs = rb_rs;
6348  if (rs == rb_default_rs) {
6349  smart_chomp:
6350  enc = rb_enc_get(str);
6351  if (rb_enc_mbminlen(enc) > 1) {
6352  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
6353  if (rb_enc_is_newline(pp, e, enc)) {
6354  e = pp;
6355  }
6356  pp = e - rb_enc_mbminlen(enc);
6357  if (pp >= p) {
6358  pp = rb_enc_left_char_head(p, pp, e, enc);
6359  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
6360  e = pp;
6361  }
6362  }
6363  if (e == RSTRING_END(str)) {
6364  return Qnil;
6365  }
6366  len = e - RSTRING_PTR(str);
6367  STR_SET_LEN(str, len);
6368  }
6369  else {
6370  if (RSTRING_PTR(str)[len-1] == '\n') {
6371  STR_DEC_LEN(str);
6372  if (RSTRING_LEN(str) > 0 &&
6373  RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
6374  STR_DEC_LEN(str);
6375  }
6376  }
6377  else if (RSTRING_PTR(str)[len-1] == '\r') {
6378  STR_DEC_LEN(str);
6379  }
6380  else {
6381  return Qnil;
6382  }
6383  }
6384  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6385  return str;
6386  }
6387  }
6388  else {
6389  rb_scan_args(argc, argv, "01", &rs);
6390  }
6391  if (NIL_P(rs)) return Qnil;
6392  StringValue(rs);
6393  rslen = RSTRING_LEN(rs);
6394  if (rslen == 0) {
6395  while (len>0 && p[len-1] == '\n') {
6396  len--;
6397  if (len>0 && p[len-1] == '\r')
6398  len--;
6399  }
6400  if (len < RSTRING_LEN(str)) {
6401  STR_SET_LEN(str, len);
6402  RSTRING_PTR(str)[len] = '\0';
6403  return str;
6404  }
6405  return Qnil;
6406  }
6407  if (rslen > len) return Qnil;
6408  newline = RSTRING_PTR(rs)[rslen-1];
6409  if (rslen == 1 && newline == '\n')
6410  goto smart_chomp;
6411 
6412  enc = rb_enc_check(str, rs);
6413  if (is_broken_string(rs)) {
6414  return Qnil;
6415  }
6416  pp = e - rslen;
6417  if (p[len-1] == newline &&
6418  (rslen <= 1 ||
6419  memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
6420  if (rb_enc_left_char_head(p, pp, e, enc) != pp)
6421  return Qnil;
6422  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
6423  ENC_CODERANGE_CLEAR(str);
6424  }
6425  STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
6426  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6427  return str;
6428  }
6429  return Qnil;
6430 }
6431 
6432 
6433 /*
6434  * call-seq:
6435  * str.chomp(separator=$/) -> new_str
6436  *
6437  * Returns a new <code>String</code> with the given record separator removed
6438  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
6439  * changed from the default Ruby record separator, then <code>chomp</code> also
6440  * removes carriage return characters (that is it will remove <code>\n</code>,
6441  * <code>\r</code>, and <code>\r\n</code>).
6442  *
6443  * "hello".chomp #=> "hello"
6444  * "hello\n".chomp #=> "hello"
6445  * "hello\r\n".chomp #=> "hello"
6446  * "hello\n\r".chomp #=> "hello\n"
6447  * "hello\r".chomp #=> "hello"
6448  * "hello \n there".chomp #=> "hello \n there"
6449  * "hello".chomp("llo") #=> "he"
6450  */
6451 
6452 static VALUE
6454 {
6455  str = rb_str_dup(str);
6456  rb_str_chomp_bang(argc, argv, str);
6457  return str;
6458 }
6459 
6460 /*
6461  * call-seq:
6462  * str.lstrip! -> self or nil
6463  *
6464  * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
6465  * change was made. See also <code>String#rstrip!</code> and
6466  * <code>String#strip!</code>.
6467  *
6468  * " hello ".lstrip #=> "hello "
6469  * "hello".lstrip! #=> nil
6470  */
6471 
6472 static VALUE
6474 {
6475  rb_encoding *enc;
6476  char *s, *t, *e;
6477 
6478  str_modify_keep_cr(str);
6479  enc = STR_ENC_GET(str);
6480  s = RSTRING_PTR(str);
6481  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6482  e = t = RSTRING_END(str);
6483  /* remove spaces at head */
6484  while (s < e) {
6485  int n;
6486  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
6487 
6488  if (!rb_isspace(cc)) break;
6489  s += n;
6490  }
6491 
6492  if (s > RSTRING_PTR(str)) {
6493  STR_SET_LEN(str, t-s);
6494  memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
6495  RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
6496  return str;
6497  }
6498  return Qnil;
6499 }
6500 
6501 
6502 /*
6503  * call-seq:
6504  * str.lstrip -> new_str
6505  *
6506  * Returns a copy of <i>str</i> with leading whitespace removed. See also
6507  * <code>String#rstrip</code> and <code>String#strip</code>.
6508  *
6509  * " hello ".lstrip #=> "hello "
6510  * "hello".lstrip #=> "hello"
6511  */
6512 
6513 static VALUE
6515 {
6516  str = rb_str_dup(str);
6517  rb_str_lstrip_bang(str);
6518  return str;
6519 }
6520 
6521 
6522 /*
6523  * call-seq:
6524  * str.rstrip! -> self or nil
6525  *
6526  * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
6527  * no change was made. See also <code>String#lstrip!</code> and
6528  * <code>String#strip!</code>.
6529  *
6530  * " hello ".rstrip #=> " hello"
6531  * "hello".rstrip! #=> nil
6532  */
6533 
6534 static VALUE
6536 {
6537  rb_encoding *enc;
6538  char *s, *t, *e;
6539 
6540  str_modify_keep_cr(str);
6541  enc = STR_ENC_GET(str);
6543  s = RSTRING_PTR(str);
6544  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6545  t = e = RSTRING_END(str);
6546 
6547  /* remove trailing spaces or '\0's */
6548  if (single_byte_optimizable(str)) {
6549  unsigned char c;
6550  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
6551  }
6552  else {
6553  char *tp;
6554 
6555  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
6556  unsigned int c = rb_enc_codepoint(tp, e, enc);
6557  if (c && !rb_isspace(c)) break;
6558  t = tp;
6559  }
6560  }
6561  if (t < e) {
6562  long len = t-RSTRING_PTR(str);
6563 
6564  STR_SET_LEN(str, len);
6565  RSTRING_PTR(str)[len] = '\0';
6566  return str;
6567  }
6568  return Qnil;
6569 }
6570 
6571 
6572 /*
6573  * call-seq:
6574  * str.rstrip -> new_str
6575  *
6576  * Returns a copy of <i>str</i> with trailing whitespace removed. See also
6577  * <code>String#lstrip</code> and <code>String#strip</code>.
6578  *
6579  * " hello ".rstrip #=> " hello"
6580  * "hello".rstrip #=> "hello"
6581  */
6582 
6583 static VALUE
6585 {
6586  str = rb_str_dup(str);
6587  rb_str_rstrip_bang(str);
6588  return str;
6589 }
6590 
6591 
6592 /*
6593  * call-seq:
6594  * str.strip! -> str or nil
6595  *
6596  * Removes leading and trailing whitespace from <i>str</i>. Returns
6597  * <code>nil</code> if <i>str</i> was not altered.
6598  */
6599 
6600 static VALUE
6602 {
6603  VALUE l = rb_str_lstrip_bang(str);
6604  VALUE r = rb_str_rstrip_bang(str);
6605 
6606  if (NIL_P(l) && NIL_P(r)) return Qnil;
6607  return str;
6608 }
6609 
6610 
6611 /*
6612  * call-seq:
6613  * str.strip -> new_str
6614  *
6615  * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
6616  *
6617  * " hello ".strip #=> "hello"
6618  * "\tgoodbye\r\n".strip #=> "goodbye"
6619  */
6620 
6621 static VALUE
6623 {
6624  str = rb_str_dup(str);
6625  rb_str_strip_bang(str);
6626  return str;
6627 }
6628 
6629 static VALUE
6630 scan_once(VALUE str, VALUE pat, long *start)
6631 {
6632  VALUE result, match;
6633  struct re_registers *regs;
6634  int i;
6635 
6636  if (rb_reg_search(pat, str, *start, 0) >= 0) {
6637  match = rb_backref_get();
6638  regs = RMATCH_REGS(match);
6639  if (BEG(0) == END(0)) {
6640  rb_encoding *enc = STR_ENC_GET(str);
6641  /*
6642  * Always consume at least one character of the input string
6643  */
6644  if (RSTRING_LEN(str) > END(0))
6645  *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
6646  RSTRING_END(str), enc);
6647  else
6648  *start = END(0)+1;
6649  }
6650  else {
6651  *start = END(0);
6652  }
6653  if (regs->num_regs == 1) {
6654  return rb_reg_nth_match(0, match);
6655  }
6656  result = rb_ary_new2(regs->num_regs);
6657  for (i=1; i < regs->num_regs; i++) {
6658  rb_ary_push(result, rb_reg_nth_match(i, match));
6659  }
6660 
6661  return result;
6662  }
6663  return Qnil;
6664 }
6665 
6666 
6667 /*
6668  * call-seq:
6669  * str.scan(pattern) -> array
6670  * str.scan(pattern) {|match, ...| block } -> str
6671  *
6672  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
6673  * <code>Regexp</code> or a <code>String</code>). For each match, a result is
6674  * generated and either added to the result array or passed to the block. If
6675  * the pattern contains no groups, each individual result consists of the
6676  * matched string, <code>$&</code>. If the pattern contains groups, each
6677  * individual result is itself an array containing one entry per group.
6678  *
6679  * a = "cruel world"
6680  * a.scan(/\w+/) #=> ["cruel", "world"]
6681  * a.scan(/.../) #=> ["cru", "el ", "wor"]
6682  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
6683  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
6684  *
6685  * And the block form:
6686  *
6687  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
6688  * print "\n"
6689  * a.scan(/(.)(.)/) {|x,y| print y, x }
6690  * print "\n"
6691  *
6692  * <em>produces:</em>
6693  *
6694  * <<cruel>> <<world>>
6695  * rceu lowlr
6696  */
6697 
6698 static VALUE
6700 {
6701  VALUE result;
6702  long start = 0;
6703  long last = -1, prev = 0;
6704  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
6705 
6706  pat = get_pat(pat, 1);
6707  if (!rb_block_given_p()) {
6708  VALUE ary = rb_ary_new();
6709 
6710  while (!NIL_P(result = scan_once(str, pat, &start))) {
6711  last = prev;
6712  prev = start;
6713  rb_ary_push(ary, result);
6714  }
6715  if (last >= 0) rb_reg_search(pat, str, last, 0);
6716  return ary;
6717  }
6718 
6719  while (!NIL_P(result = scan_once(str, pat, &start))) {
6720  last = prev;
6721  prev = start;
6722  rb_yield(result);
6723  str_mod_check(str, p, len);
6724  }
6725  if (last >= 0) rb_reg_search(pat, str, last, 0);
6726  return str;
6727 }
6728 
6729 
6730 /*
6731  * call-seq:
6732  * str.hex -> integer
6733  *
6734  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
6735  * (with an optional sign and an optional <code>0x</code>) and returns the
6736  * corresponding number. Zero is returned on error.
6737  *
6738  * "0x0a".hex #=> 10
6739  * "-1234".hex #=> -4660
6740  * "0".hex #=> 0
6741  * "wombat".hex #=> 0
6742  */
6743 
6744 static VALUE
6746 {
6747  rb_encoding *enc = rb_enc_get(str);
6748 
6749  if (!rb_enc_asciicompat(enc)) {
6750  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6751  }
6752  return rb_str_to_inum(str, 16, FALSE);
6753 }
6754 
6755 
6756 /*
6757  * call-seq:
6758  * str.oct -> integer
6759  *
6760  * Treats leading characters of <i>str</i> as a string of octal digits (with an
6761  * optional sign) and returns the corresponding number. Returns 0 if the
6762  * conversion fails.
6763  *
6764  * "123".oct #=> 83
6765  * "-377".oct #=> -255
6766  * "bad".oct #=> 0
6767  * "0377bad".oct #=> 255
6768  */
6769 
6770 static VALUE
6772 {
6773  rb_encoding *enc = rb_enc_get(str);
6774 
6775  if (!rb_enc_asciicompat(enc)) {
6776  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
6777  }
6778  return rb_str_to_inum(str, -8, FALSE);
6779 }
6780 
6781 
6782 /*
6783  * call-seq:
6784  * str.crypt(other_str) -> new_str
6785  *
6786  * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
6787  * library function <code>crypt</code>. The argument is the salt string, which
6788  * should be two characters long, each character drawn from
6789  * <code>[a-zA-Z0-9./]</code>.
6790  */
6791 
6792 static VALUE
6794 {
6795  extern char *crypt(const char *, const char *);
6796  VALUE result;
6797  const char *s, *saltp;
6798  char *res;
6799 #ifdef BROKEN_CRYPT
6800  char salt_8bit_clean[3];
6801 #endif
6802 
6803  StringValue(salt);
6804  if (RSTRING_LEN(salt) < 2)
6805  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
6806 
6807  s = RSTRING_PTR(str);
6808  if (!s) s = "";
6809  saltp = RSTRING_PTR(salt);
6810 #ifdef BROKEN_CRYPT
6811  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
6812  salt_8bit_clean[0] = saltp[0] & 0x7f;
6813  salt_8bit_clean[1] = saltp[1] & 0x7f;
6814  salt_8bit_clean[2] = '\0';
6815  saltp = salt_8bit_clean;
6816  }
6817 #endif
6818  res = crypt(s, saltp);
6819  if (!res) {
6820  rb_sys_fail("crypt");
6821  }
6822  result = rb_str_new2(res);
6823  OBJ_INFECT(result, str);
6824  OBJ_INFECT(result, salt);
6825  return result;
6826 }
6827 
6828 
6829 /*
6830  * call-seq:
6831  * str.intern -> symbol
6832  * str.to_sym -> symbol
6833  *
6834  * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
6835  * symbol if it did not previously exist. See <code>Symbol#id2name</code>.
6836  *
6837  * "Koala".intern #=> :Koala
6838  * s = 'cat'.to_sym #=> :cat
6839  * s == :cat #=> true
6840  * s = '@cat'.to_sym #=> :@cat
6841  * s == :@cat #=> true
6842  *
6843  * This can also be used to create symbols that cannot be represented using the
6844  * <code>:xxx</code> notation.
6845  *
6846  * 'cat and dog'.to_sym #=> :"cat and dog"
6847  */
6848 
6849 VALUE
6851 {
6852  VALUE str = RB_GC_GUARD(s);
6853  ID id;
6854 
6855  id = rb_intern_str(str);
6856  return ID2SYM(id);
6857 }
6858 
6859 
6860 /*
6861  * call-seq:
6862  * str.ord -> integer
6863  *
6864  * Return the <code>Integer</code> ordinal of a one-character string.
6865  *
6866  * "a".ord #=> 97
6867  */
6868 
6869 VALUE
6871 {
6872  unsigned int c;
6873 
6875  return UINT2NUM(c);
6876 }
6877 /*
6878  * call-seq:
6879  * str.sum(n=16) -> integer
6880  *
6881  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
6882  * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
6883  * to 16. The result is simply the sum of the binary value of each byte in
6884  * <i>str</i> modulo <code>2**n</code> (that is, masked with <code>2**n - 1</code>).
6885  * This is not a particularly good checksum.
6886  */
6887 
6888 static VALUE
6890 {
6891  VALUE vbits;
6892  int bits;
6893  char *ptr, *p, *pend;
6894  long len;
6895  VALUE sum = INT2FIX(0);
6896  unsigned long sum0 = 0;
6897 
6898  if (argc == 0) {
6899  bits = 16;
6900  }
6901  else {
6902  rb_scan_args(argc, argv, "01", &vbits);
6903  bits = NUM2INT(vbits);
6904  }
6905  ptr = p = RSTRING_PTR(str);
6906  len = RSTRING_LEN(str);
6907  pend = p + len;
6908 
6909  while (p < pend) {
6910  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
6911  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6912  str_mod_check(str, ptr, len);
6913  sum0 = 0;
6914  }
6915  sum0 += (unsigned char)*p;
6916  p++;
6917  }
6918 
6919  if (bits == 0) {
6920  if (sum0) {
6921  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6922  }
6923  }
6924  else {
6925  if (sum == INT2FIX(0)) {
6926  if (bits < (int)sizeof(long)*CHAR_BIT) {
6927  sum0 &= (((unsigned long)1)<<bits)-1;
6928  }
6929  sum = LONG2FIX(sum0);
6930  }
6931  else {
6932  VALUE mod;
6933 
6934  if (sum0) {
6935  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
6936  }
6937 
6938  mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
6939  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
6940  sum = rb_funcall(sum, '&', 1, mod);
6941  }
6942  }
6943  return sum;
6944 }
6945 
6946 static VALUE
6947 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
6948 {
6949  rb_encoding *enc;
6950  VALUE w;
6951  long width, len, flen = 1, fclen = 1;
6952  VALUE res;
6953  char *p;
6954  const char *f = " ";
6955  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
6956  volatile VALUE pad;
6957  int singlebyte = 1, cr;
6958 
6959  rb_scan_args(argc, argv, "11", &w, &pad);
6960  enc = STR_ENC_GET(str);
6961  width = NUM2LONG(w);
6962  if (argc == 2) {
6963  StringValue(pad);
6964  enc = rb_enc_check(str, pad);
6965  f = RSTRING_PTR(pad);
6966  flen = RSTRING_LEN(pad);
6967  fclen = str_strlen(pad, enc);
6968  singlebyte = single_byte_optimizable(pad);
6969  if (flen == 0 || fclen == 0) {
6970  rb_raise(rb_eArgError, "zero width padding");
6971  }
6972  }
6973  len = str_strlen(str, enc);
6974  if (width < 0 || len >= width) return rb_str_dup(str);
6975  n = width - len;
6976  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
6977  rlen = n - llen;
6978  cr = ENC_CODERANGE(str);
6979  if (flen > 1) {
6980  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
6981  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
6982  }
6983  size = RSTRING_LEN(str);
6984  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
6985  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
6986  (len += llen2 + rlen2) >= LONG_MAX - size) {
6987  rb_raise(rb_eArgError, "argument too big");
6988  }
6989  len += size;
6990  res = rb_str_new5(str, 0, len);
6991  p = RSTRING_PTR(res);
6992  if (flen <= 1) {
6993  memset(p, *f, llen);
6994  p += llen;
6995  }
6996  else {
6997  while (llen >= fclen) {
6998  memcpy(p,f,flen);
6999  p += flen;
7000  llen -= fclen;
7001  }
7002  if (llen > 0) {
7003  memcpy(p, f, llen2);
7004  p += llen2;
7005  }
7006  }
7007  memcpy(p, RSTRING_PTR(str), size);
7008  p += size;
7009  if (flen <= 1) {
7010  memset(p, *f, rlen);
7011  p += rlen;
7012  }
7013  else {
7014  while (rlen >= fclen) {
7015  memcpy(p,f,flen);
7016  p += flen;
7017  rlen -= fclen;
7018  }
7019  if (rlen > 0) {
7020  memcpy(p, f, rlen2);
7021  p += rlen2;
7022  }
7023  }
7024  *p = '\0';
7025  STR_SET_LEN(res, p-RSTRING_PTR(res));
7026  OBJ_INFECT(res, str);
7027  if (!NIL_P(pad)) OBJ_INFECT(res, pad);
7028  rb_enc_associate(res, enc);
7029  if (argc == 2)
7030  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
7031  if (cr != ENC_CODERANGE_BROKEN)
7032  ENC_CODERANGE_SET(res, cr);
7033  return res;
7034 }
7035 
7036 
7037 /*
7038  * call-seq:
7039  * str.ljust(integer, padstr=' ') -> new_str
7040  *
7041  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7042  * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
7043  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7044  *
7045  * "hello".ljust(4) #=> "hello"
7046  * "hello".ljust(20) #=> "hello "
7047  * "hello".ljust(20, '1234') #=> "hello123412341234123"
7048  */
7049 
7050 static VALUE
7052 {
7053  return rb_str_justify(argc, argv, str, 'l');
7054 }
7055 
7056 
7057 /*
7058  * call-seq:
7059  * str.rjust(integer, padstr=' ') -> new_str
7060  *
7061  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7062  * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
7063  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7064  *
7065  * "hello".rjust(4) #=> "hello"
7066  * "hello".rjust(20) #=> " hello"
7067  * "hello".rjust(20, '1234') #=> "123412341234123hello"
7068  */
7069 
7070 static VALUE
7072 {
7073  return rb_str_justify(argc, argv, str, 'r');
7074 }
7075 
7076 
7077 /*
7078  * call-seq:
7079  * str.center(integer, padstr=' ') -> new_str
7080  *
7081  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
7082  * <code>String</code> of length <i>integer</i> with <i>str</i> centered and
7083  * padded with <i>padstr</i>; otherwise, returns <i>str</i>.
7084  *
7085  * "hello".center(4) #=> "hello"
7086  * "hello".center(20) #=> " hello "
7087  * "hello".center(20, '123') #=> "1231231hello12312312"
7088  */
7089 
7090 static VALUE
7092 {
7093  return rb_str_justify(argc, argv, str, 'c');
7094 }
7095 
7096 /*
7097  * call-seq:
7098  * str.partition(sep) -> [head, sep, tail]
7099  * str.partition(regexp) -> [head, match, tail]
7100  *
7101  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
7102  * and returns the part before it, the match, and the part
7103  * after it.
7104  * If it is not found, returns <i>str</i> and two empty strings.
7105  *
7106  * "hello".partition("l") #=> ["he", "l", "lo"]
7107  * "hello".partition("x") #=> ["hello", "", ""]
7108  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
7109  */
7110 
7111 static VALUE
7113 {
7114  long pos;
7115  int regex = FALSE;
7116 
7117  if (TYPE(sep) == T_REGEXP) {
7118  pos = rb_reg_search(sep, str, 0, 0);
7119  regex = TRUE;
7120  }
7121  else {
7122  VALUE tmp;
7123 
7124  tmp = rb_check_string_type(sep);
7125  if (NIL_P(tmp)) {
7126  rb_raise(rb_eTypeError, "type mismatch: %s given",
7127  rb_obj_classname(sep));
7128  }
7129  sep = tmp;
7130  pos = rb_str_index(str, sep, 0);
7131  }
7132  if (pos < 0) {
7133  failed:
7134  return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
7135  }
7136  if (regex) {
7137  sep = rb_str_subpat(str, sep, INT2FIX(0));
7138  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
7139  }
7140  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
7141  sep,
7142  rb_str_subseq(str, pos+RSTRING_LEN(sep),
7143  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
7144 }
7145 
7146 /*
7147  * call-seq:
7148  * str.rpartition(sep) -> [head, sep, tail]
7149  * str.rpartition(regexp) -> [head, match, tail]
7150  *
7151  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
7152  * of the string, and returns the part before it, the match, and the part
7153  * after it.
7154  * If it is not found, returns two empty strings and <i>str</i>.
7155  *
7156  * "hello".rpartition("l") #=> ["hel", "l", "o"]
7157  * "hello".rpartition("x") #=> ["", "", "hello"]
7158  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
7159  */
7160 
7161 static VALUE
7163 {
7164  long pos = RSTRING_LEN(str);
7165  int regex = FALSE;
7166 
7167  if (TYPE(sep) == T_REGEXP) {
7168  pos = rb_reg_search(sep, str, pos, 1);
7169  regex = TRUE;
7170  }
7171  else {
7172  VALUE tmp;
7173 
7174  tmp = rb_check_string_type(sep);
7175  if (NIL_P(tmp)) {
7176  rb_raise(rb_eTypeError, "type mismatch: %s given",
7177  rb_obj_classname(sep));
7178  }
7179  sep = tmp;
7180  pos = rb_str_sublen(str, pos);
7181  pos = rb_str_rindex(str, sep, pos);
7182  }
7183  if (pos < 0) {
7184  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
7185  }
7186  if (regex) {
7187  sep = rb_reg_nth_match(0, rb_backref_get());
7188  }
7189  return rb_ary_new3(3, rb_str_substr(str, 0, pos),
7190  sep,
7191  rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
7192 }
7193 
7194 /*
7195  * call-seq:
7196  * str.start_with?([prefix]+) -> true or false
7197  *
7198  * Returns true if <i>str</i> starts with one of the prefixes given.
7199  *
7200  * p "hello".start_with?("hell") #=> true
7201  *
7202  * # returns true if one of the prefixes matches.
7203  * p "hello".start_with?("heaven", "hell") #=> true
7204  * p "hello".start_with?("heaven", "paradise") #=> false
7205  *
7206  *
7207  *
7208  */
7209 
7210 static VALUE
7212 {
7213  int i;
7214 
7215  for (i=0; i<argc; i++) {
7216  VALUE tmp = rb_check_string_type(argv[i]);
7217  if (NIL_P(tmp)) continue;
7218  rb_enc_check(str, tmp);
7219  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7220  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7221  return Qtrue;
7222  }
7223  return Qfalse;
7224 }
7225 
7226 /*
7227  * call-seq:
7228  * str.end_with?([suffix]+) -> true or false
7229  *
7230  * Returns true if <i>str</i> ends with one of the suffixes given.
7231  */
7232 
7233 static VALUE
7235 {
7236  int i;
7237  char *p, *s, *e;
7238  rb_encoding *enc;
7239 
7240  for (i=0; i<argc; i++) {
7241  VALUE tmp = rb_check_string_type(argv[i]);
7242  if (NIL_P(tmp)) continue;
7243  enc = rb_enc_check(str, tmp);
7244  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
7245  p = RSTRING_PTR(str);
7246  e = p + RSTRING_LEN(str);
7247  s = e - RSTRING_LEN(tmp);
7248  if (rb_enc_left_char_head(p, s, e, enc) != s)
7249  continue;
7250  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
7251  return Qtrue;
7252  }
7253  return Qfalse;
7254 }
7255 
7256 void
7257 rb_str_setter(VALUE val, ID id, VALUE *var)
7258 {
7259  if (!NIL_P(val) && TYPE(val) != T_STRING) {
7260  rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
7261  }
7262  *var = val;
7263 }
7264 
7265 
7266 /*
7267  * call-seq:
7268  * str.force_encoding(encoding) -> str
7269  *
7270  * Changes the encoding to +encoding+ and returns self.
7271  */
7272 
7273 static VALUE
7275 {
7276  str_modifiable(str);
7277  rb_enc_associate(str, rb_to_encoding(enc));
7278  ENC_CODERANGE_CLEAR(str);
7279  return str;
7280 }
7281 
7282 /*
7283  * call-seq:
7284  * str.valid_encoding? -> true or false
7285  *
7286  * Returns true for a string which is encoded correctly.
7287  *
7288  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
7289  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
7290  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
7291  */
7292 
7293 static VALUE
7295 {
7296  int cr = rb_enc_str_coderange(str);
7297 
7298  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
7299 }
7300 
7301 /*
7302  * call-seq:
7303  * str.ascii_only? -> true or false
7304  *
7305  * Returns true for a string which has only ASCII characters.
7306  *
7307  * "abc".force_encoding("UTF-8").ascii_only? #=> true
7308  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
7309  */
7310 
7311 static VALUE
7313 {
7314  int cr = rb_enc_str_coderange(str);
7315 
7316  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
7317 }
7318 
7333 VALUE
7334 rb_str_ellipsize(VALUE str, long len)
7335 {
7336  static const char ellipsis[] = "...";
7337  const long ellipsislen = sizeof(ellipsis) - 1;
7338  rb_encoding *const enc = rb_enc_get(str);
7339  const long blen = RSTRING_LEN(str);
7340  const char *const p = RSTRING_PTR(str), *e = p + blen;
7341  VALUE estr, ret = 0;
7342 
7343  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
7344  if (len * rb_enc_mbminlen(enc) >= blen ||
7345  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
7346  ret = str;
7347  }
7348  else if (len <= ellipsislen ||
7349  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
7350  if (rb_enc_asciicompat(enc)) {
7351  ret = rb_str_new_with_class(str, ellipsis, len);
7352  rb_enc_associate(ret, enc);
7353  }
7354  else {
7355  estr = rb_usascii_str_new(ellipsis, len);
7356  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
7357  }
7358  }
7359  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
7360  rb_str_cat(ret, ellipsis, ellipsislen);
7361  }
7362  else {
7363  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
7364  rb_enc_from_encoding(enc), 0, Qnil);
7365  rb_str_append(ret, estr);
7366  }
7367  return ret;
7368 }
7369 
7370 /**********************************************************************
7371  * Document-class: Symbol
7372  *
7373  * <code>Symbol</code> objects represent names and some strings
7374  * inside the Ruby
7375  * interpreter. They are generated using the <code>:name</code> and
7376  * <code>:"string"</code> literals
7377  * syntax, and by the various <code>to_sym</code> methods. The same
7378  * <code>Symbol</code> object will be created for a given name or string
7379  * for the duration of a program's execution, regardless of the context
7380  * or meaning of that name. Thus if <code>Fred</code> is a constant in
7381  * one context, a method in another, and a class in a third, the
7382  * <code>Symbol</code> <code>:Fred</code> will be the same object in
7383  * all three contexts.
7384  *
7385  * module One
7386  * class Fred
7387  * end
7388  * $f1 = :Fred
7389  * end
7390  * module Two
7391  * Fred = 1
7392  * $f2 = :Fred
7393  * end
7394  * def Fred()
7395  * end
7396  * $f3 = :Fred
7397  * $f1.object_id #=> 2514190
7398  * $f2.object_id #=> 2514190
7399  * $f3.object_id #=> 2514190
7400  *
7401  */
7402 
7403 
7404 /*
7405  * call-seq:
7406  * sym == obj -> true or false
7407  *
7408  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
7409  * symbol, returns <code>true</code>.
7410  */
7411 
7412 static VALUE
7413 sym_equal(VALUE sym1, VALUE sym2)
7414 {
7415  if (sym1 == sym2) return Qtrue;
7416  return Qfalse;
7417 }
7418 
7419 
7420 static int
7421 sym_printable(const char *s, const char *send, rb_encoding *enc)
7422 {
7423  while (s < send) {
7424  int n;
7425  int c = rb_enc_codepoint_len(s, send, &n, enc);
7426 
7427  if (!rb_enc_isprint(c, enc)) return FALSE;
7428  s += n;
7429  }
7430  return TRUE;
7431 }
7432 
7433 /*
7434  * call-seq:
7435  * sym.inspect -> string
7436  *
7437  * Returns the representation of <i>sym</i> as a symbol literal.
7438  *
7439  * :fred.inspect #=> ":fred"
7440  */
7441 
7442 static VALUE
7444 {
7445  VALUE str;
7446  ID id = SYM2ID(sym);
7447  rb_encoding *enc;
7448  const char *ptr;
7449  long len;
7450  char *dest;
7452 
7453  if (resenc == NULL) resenc = rb_default_external_encoding();
7454  sym = rb_id2str(id);
7455  enc = STR_ENC_GET(sym);
7456  ptr = RSTRING_PTR(sym);
7457  len = RSTRING_LEN(sym);
7458  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
7459  !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
7460  str = rb_str_inspect(sym);
7461  len = RSTRING_LEN(str);
7462  rb_str_resize(str, len + 1);
7463  dest = RSTRING_PTR(str);
7464  memmove(dest + 1, dest, len);
7465  dest[0] = ':';
7466  }
7467  else {
7468  char *dest;
7469  str = rb_enc_str_new(0, len + 1, enc);
7470  dest = RSTRING_PTR(str);
7471  dest[0] = ':';
7472  memcpy(dest + 1, ptr, len);
7473  }
7474  return str;
7475 }
7476 
7477 
7478 /*
7479  * call-seq:
7480  * sym.id2name -> string
7481  * sym.to_s -> string
7482  *
7483  * Returns the name or string corresponding to <i>sym</i>.
7484  *
7485  * :fred.id2name #=> "fred"
7486  */
7487 
7488 
7489 VALUE
7491 {
7492  ID id = SYM2ID(sym);
7493 
7494  return str_new3(rb_cString, rb_id2str(id));
7495 }
7496 
7497 
7498 /*
7499  * call-seq:
7500  * sym.to_sym -> sym
7501  * sym.intern -> sym
7502  *
7503  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
7504  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
7505  * in this case.
7506  */
7507 
7508 static VALUE
7510 {
7511  return sym;
7512 }
7513 
7514 static VALUE
7516 {
7517  VALUE obj;
7518 
7519  if (argc < 1) {
7520  rb_raise(rb_eArgError, "no receiver given");
7521  }
7522  obj = argv[0];
7523  return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
7524 }
7525 
7526 /*
7527  * call-seq:
7528  * sym.to_proc
7529  *
7530  * Returns a _Proc_ object which respond to the given method by _sym_.
7531  *
7532  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
7533  */
7534 
7535 static VALUE
7537 {
7538  static VALUE sym_proc_cache = Qfalse;
7539  enum {SYM_PROC_CACHE_SIZE = 67};
7540  VALUE proc;
7541  long id, index;
7542  VALUE *aryp;
7543 
7544  if (!sym_proc_cache) {
7545  sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
7546  rb_gc_register_mark_object(sym_proc_cache);
7547  rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
7548  }
7549 
7550  id = SYM2ID(sym);
7551  index = (id % SYM_PROC_CACHE_SIZE) << 1;
7552 
7553  aryp = RARRAY_PTR(sym_proc_cache);
7554  if (aryp[index] == sym) {
7555  return aryp[index + 1];
7556  }
7557  else {
7558  proc = rb_proc_new(sym_call, (VALUE)id);
7559  aryp[index] = sym;
7560  aryp[index + 1] = proc;
7561  return proc;
7562  }
7563 }
7564 
7565 /*
7566  * call-seq:
7567  *
7568  * sym.succ
7569  *
7570  * Same as <code>sym.to_s.succ.intern</code>.
7571  */
7572 
7573 static VALUE
7575 {
7576  return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
7577 }
7578 
7579 /*
7580  * call-seq:
7581  *
7582  * str <=> other -> -1, 0, +1 or nil
7583  *
7584  * Compares _sym_ with _other_ in string form.
7585  */
7586 
7587 static VALUE
7589 {
7590  if (!SYMBOL_P(other)) {
7591  return Qnil;
7592  }
7593  return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
7594 }
7595 
7596 /*
7597  * call-seq:
7598  *
7599  * sym.casecmp(other) -> -1, 0, +1 or nil
7600  *
7601  * Case-insensitive version of <code>Symbol#<=></code>.
7602  */
7603 
7604 static VALUE
7606 {
7607  if (!SYMBOL_P(other)) {
7608  return Qnil;
7609  }
7610  return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
7611 }
7612 
7613 /*
7614  * call-seq:
7615  * sym =~ obj -> fixnum or nil
7616  *
7617  * Returns <code>sym.to_s =~ obj</code>.
7618  */
7619 
7620 static VALUE
7622 {
7623  return rb_str_match(rb_sym_to_s(sym), other);
7624 }
7625 
7626 /*
7627  * call-seq:
7628  * sym[idx] -> char
7629  * sym[b, n] -> char
7630  *
7631  * Returns <code>sym.to_s[]</code>.
7632  */
7633 
7634 static VALUE
7636 {
7637  return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
7638 }
7639 
7640 /*
7641  * call-seq:
7642  * sym.length -> integer
7643  *
7644  * Same as <code>sym.to_s.length</code>.
7645  */
7646 
7647 static VALUE
7649 {
7650  return rb_str_length(rb_id2str(SYM2ID(sym)));
7651 }
7652 
7653 /*
7654  * call-seq:
7655  * sym.empty? -> true or false
7656  *
7657  * Returns that _sym_ is :"" or not.
7658  */
7659 
7660 static VALUE
7662 {
7663  return rb_str_empty(rb_id2str(SYM2ID(sym)));
7664 }
7665 
7666 /*
7667  * call-seq:
7668  * sym.upcase -> symbol
7669  *
7670  * Same as <code>sym.to_s.upcase.intern</code>.
7671  */
7672 
7673 static VALUE
7675 {
7677 }
7678 
7679 /*
7680  * call-seq:
7681  * sym.downcase -> symbol
7682  *
7683  * Same as <code>sym.to_s.downcase.intern</code>.
7684  */
7685 
7686 static VALUE
7688 {
7690 }
7691 
7692 /*
7693  * call-seq:
7694  * sym.capitalize -> symbol
7695  *
7696  * Same as <code>sym.to_s.capitalize.intern</code>.
7697  */
7698 
7699 static VALUE
7701 {
7703 }
7704 
7705 /*
7706  * call-seq:
7707  * sym.swapcase -> symbol
7708  *
7709  * Same as <code>sym.to_s.swapcase.intern</code>.
7710  */
7711 
7712 static VALUE
7714 {
7716 }
7717 
7718 /*
7719  * call-seq:
7720  * sym.encoding -> encoding
7721  *
7722  * Returns the Encoding object that represents the encoding of _sym_.
7723  */
7724 
7725 static VALUE
7727 {
7728  return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
7729 }
7730 
7731 ID
7733 {
7734  VALUE tmp;
7735 
7736  switch (TYPE(name)) {
7737  default:
7738  tmp = rb_check_string_type(name);
7739  if (NIL_P(tmp)) {
7740  tmp = rb_inspect(name);
7741  rb_raise(rb_eTypeError, "%s is not a symbol",
7742  RSTRING_PTR(tmp));
7743  }
7744  name = tmp;
7745  /* fall through */
7746  case T_STRING:
7747  name = rb_str_intern(name);
7748  /* fall through */
7749  case T_SYMBOL:
7750  return SYM2ID(name);
7751  }
7752  return Qnil; /* not reached */
7753 }
7754 
7755 /*
7756  * A <code>String</code> object holds and manipulates an arbitrary sequence of
7757  * bytes, typically representing characters. String objects may be created
7758  * using <code>String::new</code> or as literals.
7759  *
7760  * Because of aliasing issues, users of strings should be aware of the methods
7761  * that modify the contents of a <code>String</code> object. Typically,
7762  * methods with names ending in ``!'' modify their receiver, while those
7763  * without a ``!'' return a new <code>String</code>. However, there are
7764  * exceptions, such as <code>String#[]=</code>.
7765  *
7766  */
7767 
7768 void
7770 {
/*
 * Sets up the String and Symbol classes: creates both classes, registers
 * their methods, caches the "to_s" ID and defines the field-separator
 * global variables.
 *
 * NOTE(review): the listing numbers embedded at the start of each line skip
 * values (7768 -> 7770, 7774 -> 7778, ...), so lines appear to be missing
 * here, including the function's declarator. Only the statements that are
 * actually present are described.
 */
/* From this point on, rb_intern() expands to rb_intern_const(). */
7771 #undef rb_intern
7772 #define rb_intern(str) rb_intern_const(str)
7773 
/* String class, a direct subclass of Object, and its instance methods. */
7774  rb_cString = rb_define_class("String", rb_cObject);
7778  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
7779  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
7783  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
7785  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
7791  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
7792  rb_define_method(rb_cString, "length", rb_str_length, 0);
7794  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
7795  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
7802  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
7805  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
7808  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
7809  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
7810  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
7811 
7812  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
7815  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
7816  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
7818 
7819  rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
7820  rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
7821  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
7822  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
7823 
7828 
7836  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
7838  rb_define_method(rb_cString, "concat", rb_str_concat, 1);
7840  rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
/* "intern" and "to_sym" share one implementation. */
7842  rb_define_method(rb_cString, "intern", rb_str_intern, 0);
7843  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
7845 
7846  rb_define_method(rb_cString, "include?", rb_str_include, 1);
7847  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
7848  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
7849 
7851 
7852  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
7853  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
7854  rb_define_method(rb_cString, "center", rb_str_center, -1);
7855 
7856  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
7857  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
7859  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
7861  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
7862  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
7863 
7871 
7874  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
7875  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
7876  rb_define_method(rb_cString, "count", rb_str_count, -1);
7877 
7882 
7883  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
7884  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
7885  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
7886  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
7887 
7888  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
7889 
7890  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
7892 
7893  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
7894  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
7895 
7896  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
7897  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
7898  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
7900 
/* Cache the ID of "to_s" in the file-level id_to_s variable. */
7901  id_to_s = rb_intern("to_s");
7902 
/* rb_fs starts out as nil; both "$;" and "$-F" are bound to it. */
7903  rb_fs = Qnil;
7904  rb_define_variable("$;", &rb_fs);
7905  rb_define_variable("$-F", &rb_fs);
7906 
/* Symbol class, a direct subclass of Object, and its methods. */
7907  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
7911  rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
7912 
7915  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
7917  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
7918  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
7919  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
7920  rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
/* "succ" and "next" share one implementation. */
7921  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
7922  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
7923 
7924  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
7925  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
7927 
/* "[]"/"slice" and "length"/"size" are aliases of each other. */
7928  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
7929  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
7930  rb_define_method(rb_cSymbol, "length", sym_length, 0);
7931  rb_define_method(rb_cSymbol, "size", sym_length, 0);
7932  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
7933  rb_define_method(rb_cSymbol, "match", sym_match, 1);
7934 
7935  rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
7936  rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
7937  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
7938  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
7939 
7940  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
7941 }
7942