• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

string.c

Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: yugui $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018 
00019 #define BEG(no) regs->beg[no]
00020 #define END(no) regs->end[no]
00021 
00022 #include <math.h>
00023 #include <ctype.h>
00024 
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028 
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030 
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047 
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050 
/*
 * String-private constants and flag predicates.  STR_NOEMBED, STR_SHARED and
 * STR_ASSOC are per-object flag bits; the *_P macros test combinations of
 * them with FL_ALL / FL_TEST / FL_ANY.
 */
00051 #define RUBY_MAX_CHAR_LEN 16
00052 #define STR_TMPLOCK FL_USER7
00053 #define STR_NOEMBED FL_USER1
00054 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
00055 #define STR_ASSOC   FL_USER3
/* True when both STR_NOEMBED and ELTS_SHARED are set on s. */
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
/* True when both STR_NOEMBED and STR_ASSOC are set on s. */
00057 #define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
00058 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* True when STR_NOEMBED is set together with ELTS_SHARED or STR_ASSOC.
 * Code in this file does not read or write aux.capa for such strings. */
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear ELTS_SHARED and STR_ASSOC, but only when STR_NOEMBED is set. */
00060 #define STR_UNSET_NOCAPA(s) do {\
00061     if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
00062 } while (0)
00063 
00064 
/* Set STR_NOEMBED on str and reset its embedded-length bits to 0. */
00065 #define STR_SET_NOEMBED(str) do {\
00066     FL_SET(str, STR_NOEMBED);\
00067     STR_SET_EMBED_LEN(str, 0);\
00068 } while (0)
/* Clear STR_NOEMBED; the embedded-length bits are left untouched. */
00069 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when STR_NOEMBED is not set on str. */
00070 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n in the RSTRING_EMBED_LEN_MASK bit field of the flags word.
 * n is evaluated exactly once (it is copied into tmp_n first). */
00071 #define STR_SET_EMBED_LEN(str, n) do { \
00072     long tmp_n = (n);\
00073     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00074     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00075 } while (0)
00076 
/* Set the length of str to n: in the flags bit field when STR_EMBED_P(str),
 * otherwise in as.heap.len. */
00077 #define STR_SET_LEN(str, n) do { \
00078     if (STR_EMBED_P(str)) {\
00079         STR_SET_EMBED_LEN(str, n);\
00080     }\
00081     else {\
00082         RSTRING(str)->as.heap.len = (n);\
00083     }\
00084 } while (0)
00085 
/* Decrease the length of str by one, using the flags bit field when
 * STR_EMBED_P(str) and as.heap.len otherwise.  No check is made that the
 * length is positive. */
00086 #define STR_DEC_LEN(str) do {\
00087     if (STR_EMBED_P(str)) {\
00088         long n = RSTRING_LEN(str);\
00089         n--;\
00090         STR_SET_EMBED_LEN(str, n);\
00091     }\
00092     else {\
00093         RSTRING(str)->as.heap.len--;\
00094     }\
00095 } while (0)
00096 
/*
 * Give str room for `capacity` bytes plus one terminator byte.
 *
 * - Embedded string: nothing happens unless capacity exceeds
 *   RSTRING_EMBED_LEN_MAX; then a buffer is allocated, the current bytes are
 *   copied into it, and the string is switched to heap storage with
 *   aux.capa set to capacity.
 * - Heap string: the buffer is reallocated; aux.capa is updated only when
 *   STR_NOCAPA_P(str) is false.
 *
 * Every use of `capacity` is parenthesized so that an argument containing
 * operators (e.g. `a ? b : c`) yields the intended allocation size.
 * `str` and `capacity` are evaluated more than once: pass side-effect-free
 * expressions.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114 
/* Coderange predicates; both go through rb_enc_str_coderange(), which scans
 * the string when its cached coderange is ENC_CODERANGE_UNKNOWN. */
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117 
/* Look up the rb_encoding for the encoding index stored on str. */
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119 
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123     rb_encoding *enc;
00124 
00125     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00126     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127         return 1;
00128 
00129     enc = STR_ENC_GET(str);
00130     if (rb_enc_mbmaxlen(enc) == 1)
00131         return 1;
00132 
00133     /* Conservative.  Possibly single byte.
00134      * "\xa1" in Shift_JIS for example. */
00135     return 0;
00136 }
00137 
00138 VALUE rb_fs;
00139 
/*
 * Find the first byte in [p, e) for which ISASCII() is false.
 * Returns a pointer to that byte, or NULL when there is none.
 *
 * When SIZEOF_VALUE is 4 or 8, NONASCII_MASK (high bit of every byte set) is
 * defined and ranges longer than two VALUE words are checked one aligned
 * word at a time; bytewise loops cover the unaligned head, the tail, and
 * locate the exact byte inside a word that matched the mask.
 * NONASCII_MASK stays defined after this function and is tested by later
 * code with #ifdef.
 */
00140 static inline const char *
00141 search_nonascii(const char *p, const char *e)
00142 {
00143 #if SIZEOF_VALUE == 8
00144 # define NONASCII_MASK 0x8080808080808080ULL
00145 #elif SIZEOF_VALUE == 4
00146 # define NONASCII_MASK 0x80808080UL
00147 #endif
00148 #ifdef NONASCII_MASK
00149     if ((int)sizeof(VALUE) * 2 < e - p) {
00150         const VALUE *s, *t;
00151         const VALUE lowbits = sizeof(VALUE) - 1;
00152         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); /* p rounded up to a word boundary */
00153         while (p < (const char *)s) { /* unaligned head, byte by byte */
00154             if (!ISASCII(*p))
00155                 return p;
00156             p++;
00157         }
00158         t = (const VALUE*)(~lowbits & (VALUE)e); /* e rounded down to a word boundary */
00159         while (s < t) {
00160             if (*s & NONASCII_MASK) {
00161                 t = s; /* stop here; the byte loop below finds the exact position */
00162                 break;
00163             }
00164             s++;
00165         }
00166         p = (const char *)t;
00167     }
00168 #endif
00169     while (p < e) {
00170         if (!ISASCII(*p))
00171             return p;
00172         p++;
00173     }
00174     return NULL;
00175 }
00176 
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180     const char *e = p + len;
00181 
00182     if (rb_enc_to_index(enc) == 0) {
00183         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00184         p = search_nonascii(p, e);
00185         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186     }
00187 
00188     if (rb_enc_asciicompat(enc)) {
00189         p = search_nonascii(p, e);
00190         if (!p) {
00191             return ENC_CODERANGE_7BIT;
00192         }
00193         while (p < e) {
00194             int ret = rb_enc_precise_mbclen(p, e, enc);
00195             if (!MBCLEN_CHARFOUND_P(ret)) {
00196                 return ENC_CODERANGE_BROKEN;
00197             }
00198             p += MBCLEN_CHARFOUND_LEN(ret);
00199             if (p < e) {
00200                 p = search_nonascii(p, e);
00201                 if (!p) {
00202                     return ENC_CODERANGE_VALID;
00203                 }
00204             }
00205         }
00206         if (e < p) {
00207             return ENC_CODERANGE_BROKEN;
00208         }
00209         return ENC_CODERANGE_VALID;
00210     }
00211 
00212     while (p < e) {
00213         int ret = rb_enc_precise_mbclen(p, e, enc);
00214 
00215         if (!MBCLEN_CHARFOUND_P(ret)) {
00216             return ENC_CODERANGE_BROKEN;
00217         }
00218         p += MBCLEN_CHARFOUND_LEN(ret);
00219     }
00220     if (e < p) {
00221         return ENC_CODERANGE_BROKEN;
00222     }
00223     return ENC_CODERANGE_VALID;
00224 }
00225 
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229     const char *p = s;
00230 
00231     if (*cr == ENC_CODERANGE_BROKEN)
00232         return e - s;
00233 
00234     if (rb_enc_to_index(enc) == 0) {
00235         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00236         p = search_nonascii(p, e);
00237         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238         return e - s;
00239     }
00240     else if (rb_enc_asciicompat(enc)) {
00241         p = search_nonascii(p, e);
00242         if (!p) {
00243             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244             return e - s;
00245         }
00246         while (p < e) {
00247             int ret = rb_enc_precise_mbclen(p, e, enc);
00248             if (!MBCLEN_CHARFOUND_P(ret)) {
00249                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250                 return p - s;
00251             }
00252             p += MBCLEN_CHARFOUND_LEN(ret);
00253             if (p < e) {
00254                 p = search_nonascii(p, e);
00255                 if (!p) {
00256                     *cr = ENC_CODERANGE_VALID;
00257                     return e - s;
00258                 }
00259             }
00260         }
00261         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262         return p - s;
00263     }
00264     else {
00265         while (p < e) {
00266             int ret = rb_enc_precise_mbclen(p, e, enc);
00267             if (!MBCLEN_CHARFOUND_P(ret)) {
00268                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269                 return p - s;
00270             }
00271             p += MBCLEN_CHARFOUND_LEN(ret);
00272         }
00273         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274         return p - s;
00275     }
00276 }
00277 
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281     rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283 
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287     /* this function is designed for copying encoding and coderange
00288      * from src to new string "dest" which is made from the part of src.
00289      */
00290     str_enc_copy(dest, src);
00291     switch (ENC_CODERANGE(src)) {
00292       case ENC_CODERANGE_7BIT:
00293         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294         break;
00295       case ENC_CODERANGE_VALID:
00296         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299         else
00300             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301         break;
00302       default:
00303         if (RSTRING_LEN(dest) == 0) {
00304             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306             else
00307                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308         }
00309         break;
00310     }
00311 }
00312 
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316     str_enc_copy(dest, src);
00317     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319 
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323     int cr = ENC_CODERANGE(str);
00324 
00325     if (cr == ENC_CODERANGE_UNKNOWN) {
00326         rb_encoding *enc = STR_ENC_GET(str);
00327         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328         ENC_CODERANGE_SET(str, cr);
00329     }
00330     return cr;
00331 }
00332 
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336     rb_encoding *enc = STR_ENC_GET(str);
00337 
00338     if (!rb_enc_asciicompat(enc))
00339         return FALSE;
00340     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341         return TRUE;
00342     return FALSE;
00343 }
00344 
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349         rb_raise(rb_eRuntimeError, "string modified");
00350     }
00351 }
00352 
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356     if (OBJ_FROZEN(s)) {
00357         rb_raise(rb_eRuntimeError, "string frozen");
00358     }
00359 }
00360 
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364     if (STR_EMBED_P(str)) {
00365         return RSTRING_EMBED_LEN_MAX;
00366     }
00367     else if (STR_NOCAPA_P(str)) {
00368         return RSTRING(str)->as.heap.len;
00369     }
00370     else {
00371         return RSTRING(str)->as.heap.aux.capa;
00372     }
00373 }
00374 
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378     NEWOBJ(str, struct RString);
00379     OBJSETUP(str, klass, T_STRING);
00380 
00381     str->as.heap.ptr = 0;
00382     str->as.heap.len = 0;
00383     str->as.heap.aux.capa = 0;
00384 
00385     return (VALUE)str;
00386 }
00387 
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391     VALUE str;
00392 
00393     if (len < 0) {
00394         rb_raise(rb_eArgError, "negative string size (or size too big)");
00395     }
00396 
00397     str = str_alloc(klass);
00398     if (len > RSTRING_EMBED_LEN_MAX) {
00399         RSTRING(str)->as.heap.aux.capa = len;
00400         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401         STR_SET_NOEMBED(str);
00402     }
00403     else if (len == 0) {
00404         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405     }
00406     if (ptr) {
00407         memcpy(RSTRING_PTR(str), ptr, len);
00408     }
00409     STR_SET_LEN(str, len);
00410     RSTRING_PTR(str)[len] = '\0';
00411     return str;
00412 }
00413 
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417     return str_new(rb_cString, ptr, len);
00418 }
00419 
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423     VALUE str = rb_str_new(ptr, len);
00424     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425     return str;
00426 }
00427 
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431     VALUE str = rb_str_new(ptr, len);
00432     rb_enc_associate(str, enc);
00433     return str;
00434 }
00435 
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439     if (!ptr) {
00440         rb_raise(rb_eArgError, "NULL pointer given");
00441     }
00442     return rb_str_new(ptr, strlen(ptr));
00443 }
00444 
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447 
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451     VALUE str = rb_str_new2(ptr);
00452     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453     return str;
00454 }
00455 
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458 
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462     VALUE str = rb_str_new(ptr, len);
00463 
00464     OBJ_TAINT(str);
00465     return str;
00466 }
00467 
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471     VALUE str = rb_str_new2(ptr);
00472 
00473     OBJ_TAINT(str);
00474     return str;
00475 }
00476 
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479 
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483     rb_econv_t *ec;
00484     rb_econv_result_t ret;
00485     long len;
00486     VALUE newstr;
00487     const unsigned char *sp;
00488     unsigned char *dp;
00489 
00490     if (!to) return str;
00491     if (from == to) return str;
00492     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493         to == rb_ascii8bit_encoding()) {
00494         if (STR_ENC_GET(str) != to) {
00495             str = rb_str_dup(str);
00496             rb_enc_associate(str, to);
00497         }
00498         return str;
00499     }
00500 
00501     len = RSTRING_LEN(str);
00502     newstr = rb_str_new(0, len);
00503 
00504   retry:
00505     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506     if (!ec) return str;
00507 
00508     sp = (unsigned char*)RSTRING_PTR(str);
00509     dp = (unsigned char*)RSTRING_PTR(newstr);
00510     ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511                            &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512     rb_econv_close(ec);
00513     switch (ret) {
00514       case econv_destination_buffer_full:
00515         /* destination buffer short */
00516         len = len < 2 ? 2 : len * 2;
00517         rb_str_resize(newstr, len);
00518         goto retry;
00519 
00520       case econv_finished:
00521         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522         rb_str_set_len(newstr, len);
00523         rb_enc_associate(newstr, to);
00524         return newstr;
00525 
00526       default:
00527         /* some error, return original */
00528         return str;
00529     }
00530 }
00531 
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537 
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541     VALUE str;
00542 
00543     str = rb_tainted_str_new(ptr, len);
00544     if (eenc == rb_usascii_encoding() &&
00545         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546         rb_enc_associate(str, rb_ascii8bit_encoding());
00547         return str;
00548     }
00549     rb_enc_associate(str, eenc);
00550     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552 
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558 
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564 
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570 
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576 
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582 
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588 
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594 
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600 
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606 
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611         STR_SET_EMBED(str2);
00612         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614     }
00615     else {
00616         str = rb_str_new_frozen(str);
00617         FL_SET(str2, STR_NOEMBED);
00618         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620         RSTRING(str2)->as.heap.aux.shared = str;
00621         FL_SET(str2, ELTS_SHARED);
00622     }
00623     rb_enc_cr_str_exact_copy(str2, str);
00624 
00625     return str2;
00626 }
00627 
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631     return str_replace_shared(str_alloc(klass), str);
00632 }
00633 
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637     return str_new_shared(klass, str);
00638 }
00639 
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643     VALUE str2 = str_new3(rb_obj_class(str), str);
00644 
00645     OBJ_INFECT(str2, str);
00646     return str2;
00647 }
00648 
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651 
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655     VALUE str2;
00656 
00657     str2 = str_alloc(klass);
00658     STR_SET_NOEMBED(str2);
00659     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661     if (STR_SHARED_P(str)) {
00662         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663         assert(OBJ_FROZEN(shared));
00664         FL_SET(str2, ELTS_SHARED);
00665         RSTRING(str2)->as.heap.aux.shared = shared;
00666     }
00667     else {
00668         FL_SET(str, ELTS_SHARED);
00669         RSTRING(str)->as.heap.aux.shared = str2;
00670     }
00671     rb_enc_cr_str_exact_copy(str2, str);
00672     OBJ_INFECT(str2, str);
00673     return str2;
00674 }
00675 
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679     VALUE klass, str;
00680 
00681     if (OBJ_FROZEN(orig)) return orig;
00682     klass = rb_obj_class(orig);
00683     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684         long ofs;
00685         assert(OBJ_FROZEN(str));
00686         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689             ENCODING_GET(str) != ENCODING_GET(orig)) {
00690             str = str_new3(klass, str);
00691             RSTRING(str)->as.heap.ptr += ofs;
00692             RSTRING(str)->as.heap.len -= ofs;
00693             rb_enc_cr_str_exact_copy(str, orig);
00694             OBJ_INFECT(str, orig);
00695         }
00696     }
00697     else if (STR_EMBED_P(orig)) {
00698         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699         rb_enc_cr_str_exact_copy(str, orig);
00700         OBJ_INFECT(str, orig);
00701     }
00702     else if (STR_ASSOC_P(orig)) {
00703         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704         FL_UNSET(orig, STR_ASSOC);
00705         str = str_new4(klass, orig);
00706         FL_SET(str, STR_ASSOC);
00707         RSTRING(str)->as.heap.aux.shared = assoc;
00708     }
00709     else {
00710         str = str_new4(klass, orig);
00711     }
00712     OBJ_FREEZE(str);
00713     return str;
00714 }
00715 
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718 
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722     return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724 
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726            rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728 
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732     VALUE v = rb_str_new5(str, 0, 0);
00733     OBJ_INFECT(v, str);
00734     return v;
00735 }
00736 
00737 #define STR_BUF_MIN_SIZE 128
00738 
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742     VALUE str = str_alloc(rb_cString);
00743 
00744     if (capa < STR_BUF_MIN_SIZE) {
00745         capa = STR_BUF_MIN_SIZE;
00746     }
00747     FL_SET(str, STR_NOEMBED);
00748     RSTRING(str)->as.heap.aux.capa = capa;
00749     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750     RSTRING(str)->as.heap.ptr[0] = '\0';
00751 
00752     return str;
00753 }
00754 
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758     VALUE str;
00759     long len = strlen(ptr);
00760 
00761     str = rb_str_buf_new(len);
00762     rb_str_buf_cat(str, ptr, len);
00763 
00764     return str;
00765 }
00766 
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769 
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773     return str_new(0, 0, len);
00774 }
00775 
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780         xfree(RSTRING(str)->as.heap.ptr);
00781     }
00782 }
00783 
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788         return RSTRING(str)->as.heap.aux.capa;
00789     }
00790     else {
00791         return 0;
00792     }
00793 }
00794 
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798     return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800 
00801 static inline void str_discard(VALUE str);
00802 
/*
 * Replace the contents of str with those of str2.
 *
 * Short contents (<= RSTRING_EMBED_LEN_MAX) are copied into str's embedded
 * storage and str2 is left as it was.  Longer contents are handed over:
 * str takes str2's buffer pointer, length and capacity (or shared/assoc
 * reference), and str2 is reset to an empty embedded string.  Taint,
 * encoding and coderange of str2 are applied to str.  Nothing happens when
 * str and str2 are the same object.
 */
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806     rb_encoding *enc;
00807     int cr;
00808     if (str == str2) return;
00809     enc = STR_ENC_GET(str2); /* captured before str2 is touched */
00810     cr = ENC_CODERANGE(str2);
00811     str_discard(str); /* NOTE(review): defined outside this chunk; presumably releases str's current buffer - confirm */
00812     OBJ_INFECT(str, str2);
00813     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814         STR_SET_EMBED(str);
00815         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); /* +1 copies the byte after the contents too */
00816         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817         rb_enc_associate(str, enc);
00818         ENC_CODERANGE_SET(str, cr);
00819         return;
00820     }
00821     STR_SET_NOEMBED(str);
00822     STR_UNSET_NOCAPA(str);
00823     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825     if (STR_NOCAPA_P(str2)) {
00826         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); /* carry over str2's shared/assoc flag bits */
00827         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828     }
00829     else {
00830         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831     }
00832     STR_SET_EMBED(str2);        /* abandon str2 */
00833     RSTRING_PTR(str2)[0] = 0;
00834     STR_SET_EMBED_LEN(str2, 0);
00835     rb_enc_associate(str, enc);
00836     ENC_CODERANGE_SET(str, cr);
00837 }
00838 
00839 static ID id_to_s;
00840 
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844     VALUE str;
00845 
00846     if (TYPE(obj) == T_STRING) {
00847         return obj;
00848     }
00849     str = rb_funcall(obj, id_to_s, 0);
00850     if (TYPE(str) != T_STRING)
00851         return rb_any_to_s(obj);
00852     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853     return str;
00854 }
00855 
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859     long len;
00860 
00861     len = RSTRING_LEN(str2);
00862     if (STR_ASSOC_P(str2)) {
00863         str2 = rb_str_new4(str2);
00864     }
00865     if (STR_SHARED_P(str2)) {
00866         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867         assert(OBJ_FROZEN(shared));
00868         STR_SET_NOEMBED(str);
00869         RSTRING(str)->as.heap.len = len;
00870         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871         FL_SET(str, ELTS_SHARED);
00872         FL_UNSET(str, STR_ASSOC);
00873         RSTRING(str)->as.heap.aux.shared = shared;
00874     }
00875     else {
00876         str_replace_shared(str, str2);
00877     }
00878 
00879     OBJ_INFECT(str, str2);
00880     rb_enc_cr_str_exact_copy(str, str2);
00881     return str;
00882 }
00883 
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887     VALUE dup = str_alloc(klass);
00888     str_replace(dup, str);
00889     return dup;
00890 }
00891 
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895     return str_duplicate(rb_obj_class(str), str);
00896 }
00897 
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901     return str_replace(str_alloc(rb_cString), str);
00902 }
00903 
00904 /*
00905  *  call-seq:
00906  *     String.new(str="")   -> new_str
00907  *
00908  *  Returns a new string object containing a copy of <i>str</i>.
00909  */
00910 
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914     VALUE orig;
00915 
00916     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917         rb_str_replace(str, orig);
00918     return str;
00919 }
00920 
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924     long c;
00925     const char *q;
00926 
00927     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929     }
00930     else if (rb_enc_asciicompat(enc)) {
00931         c = 0;
00932         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933             while (p < e) {
00934                 if (ISASCII(*p)) {
00935                     q = search_nonascii(p, e);
00936                     if (!q)
00937                         return c + (e - p);
00938                     c += q - p;
00939                     p = q;
00940                 }
00941                 p += rb_enc_fast_mbclen(p, e, enc);
00942                 c++;
00943             }
00944         }
00945         else {
00946             while (p < e) {
00947                 if (ISASCII(*p)) {
00948                     q = search_nonascii(p, e);
00949                     if (!q)
00950                         return c + (e - p);
00951                     c += q - p;
00952                     p = q;
00953                 }
00954                 p += rb_enc_mbclen(p, e, enc);
00955                 c++;
00956             }
00957         }
00958         return c;
00959     }
00960 
00961     for (c=0; p<e; c++) {
00962         p += rb_enc_mbclen(p, e, enc);
00963     }
00964     return c;
00965 }
00966 
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972 
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976     long c;
00977     const char *q;
00978     int ret;
00979 
00980     *cr = 0;
00981     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983     }
00984     else if (rb_enc_asciicompat(enc)) {
00985         c = 0;
00986         while (p < e) {
00987             if (ISASCII(*p)) {
00988                 q = search_nonascii(p, e);
00989                 if (!q) {
00990                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991                     return c + (e - p);
00992                 }
00993                 c += q - p;
00994                 p = q;
00995             }
00996             ret = rb_enc_precise_mbclen(p, e, enc);
00997             if (MBCLEN_CHARFOUND_P(ret)) {
00998                 *cr |= ENC_CODERANGE_VALID;
00999                 p += MBCLEN_CHARFOUND_LEN(ret);
01000             }
01001             else {
01002                 *cr = ENC_CODERANGE_BROKEN;
01003                 p++;
01004             }
01005             c++;
01006         }
01007         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008         return c;
01009     }
01010 
01011     for (c=0; p<e; c++) {
01012         ret = rb_enc_precise_mbclen(p, e, enc);
01013         if (MBCLEN_CHARFOUND_P(ret)) {
01014             *cr |= ENC_CODERANGE_VALID;
01015             p += MBCLEN_CHARFOUND_LEN(ret);
01016         }
01017         else {
01018             *cr = ENC_CODERANGE_BROKEN;
01019             if (p + rb_enc_mbminlen(enc) <= e)
01020                 p += rb_enc_mbminlen(enc);
01021             else
01022                 p = e;
01023         }
01024     }
01025     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026     return c;
01027 }
01028 
/* Only built when NONASCII_MASK was defined (SIZEOF_VALUE is 4 or 8). */
01029 #ifdef NONASCII_MASK
/* A byte is a lead byte unless its top two bits are 10 (UTF-8 continuation). */
01030 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Count the bytes of the word *s that are not of the form 10xxxxxx.
 * Each byte is reduced to a 0/1 flag in its lowest bit, and the flags are
 * then summed into the low byte by shifted additions.
 */
01031 static inline VALUE
01032 count_utf8_lead_bytes_with_word(const VALUE *s)
01033 {
01034     VALUE d = *s;
01035     d |= ~(d>>1); /* per byte, bit 6 becomes (bit6 | ~bit7): 0 only for 10xxxxxx */
01036     d >>= 6; /* move that flag to bit 0 of each byte */
01037     d &= NONASCII_MASK >> 7; /* keep bit 0 of each byte (0x01 repeated) */
01038     d += (d>>8); /* fold the per-byte flags together */
01039     d += (d>>16);
01040 #if SIZEOF_VALUE == 8
01041     d += (d>>32);
01042 #endif
01043     return (d&0xF); /* the total is at most sizeof(VALUE), so 4 bits suffice */
01044 }
01045 #endif
01046 
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050     const char *p, *e;
01051     long n;
01052     int cr;
01053 
01054     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055     if (!enc) enc = STR_ENC_GET(str);
01056     p = RSTRING_PTR(str);
01057     e = RSTRING_END(str);
01058     cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061         enc == rb_utf8_encoding()) {
01062 
01063         VALUE len = 0;
01064         if ((int)sizeof(VALUE) * 2 < e - p) {
01065             const VALUE *s, *t;
01066             const VALUE lowbits = sizeof(VALUE) - 1;
01067             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068             t = (const VALUE*)(~lowbits & (VALUE)e);
01069             while (p < (const char *)s) {
01070                 if (is_utf8_lead_byte(*p)) len++;
01071                 p++;
01072             }
01073             while (s < t) {
01074                 len += count_utf8_lead_bytes_with_word(s);
01075                 s++;
01076             }
01077             p = (const char *)s;
01078         }
01079         while (p < e) {
01080             if (is_utf8_lead_byte(*p)) len++;
01081             p++;
01082         }
01083         return (long)len;
01084     }
01085 #endif
01086     n = rb_enc_strlen_cr(p, e, enc, &cr);
01087     if (cr) {
01088         ENC_CODERANGE_SET(str, cr);
01089     }
01090     return n;
01091 }
01092 
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096     return str_strlen(str, STR_ENC_GET(str));
01097 }
01098 
01099 /*
01100  *  call-seq:
01101  *     str.length   -> integer
01102  *     str.size     -> integer
01103  *
01104  *  Returns the character length of <i>str</i>.
01105  */
01106 
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110     long len;
01111 
01112     len = str_strlen(str, STR_ENC_GET(str));
01113     return LONG2NUM(len);
01114 }
01115 
01116 /*
01117  *  call-seq:
01118  *     str.bytesize  -> integer
01119  *
01120  *  Returns the length of <i>str</i> in bytes.
01121  */
01122 
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126     return INT2NUM(RSTRING_LEN(str));
01127 }
01128 
01129 /*
01130  *  call-seq:
01131  *     str.empty?   -> true or false
01132  *
01133  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01134  *
01135  *     "hello".empty?   #=> false
01136  *     "".empty?        #=> true
01137  */
01138 
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142     if (RSTRING_LEN(str) == 0)
01143         return Qtrue;
01144     return Qfalse;
01145 }
01146 
01147 /*
01148  *  call-seq:
01149  *     str + other_str   -> new_str
01150  *
01151  *  Concatenation---Returns a new <code>String</code> containing
01152  *  <i>other_str</i> concatenated to <i>str</i>.
01153  *
01154  *     "Hello from " + self.to_s   #=> "Hello from main"
01155  */
01156 
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160     VALUE str3;
01161     rb_encoding *enc;
01162 
01163     StringValue(str2);
01164     enc = rb_enc_check(str1, str2);
01165     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168            RSTRING_PTR(str2), RSTRING_LEN(str2));
01169     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170 
01171     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172         OBJ_TAINT(str3);
01173     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175     return str3;
01176 }
01177 
01178 /*
01179  *  call-seq:
01180  *     str * integer   -> new_str
01181  *
01182  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
01183  *  the receiver.
01184  *
01185  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01186  */
01187 
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191     VALUE str2;
01192     long n, len;
01193     char *ptr2;
01194 
01195     len = NUM2LONG(times);
01196     if (len < 0) {
01197         rb_raise(rb_eArgError, "negative argument");
01198     }
01199     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01200         rb_raise(rb_eArgError, "argument too big");
01201     }
01202 
01203     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204     ptr2 = RSTRING_PTR(str2);
01205     if (len) {
01206         n = RSTRING_LEN(str);
01207         memcpy(ptr2, RSTRING_PTR(str), n);
01208         while (n <= len/2) {
01209             memcpy(ptr2 + n, ptr2, n);
01210             n *= 2;
01211         }
01212         memcpy(ptr2 + n, ptr2, len-n);
01213     }
01214     ptr2[RSTRING_LEN(str2)] = '\0';
01215     OBJ_INFECT(str2, str);
01216     rb_enc_cr_str_copy_for_substr(str2, str);
01217 
01218     return str2;
01219 }
01220 
01221 /*
01222  *  call-seq:
01223  *     str % arg   -> new_str
01224  *
01225  *  Format---Uses <i>str</i> as a format specification, and returns the result
01226  *  of applying it to <i>arg</i>. If the format specification contains more than
01227  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01228  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01229  *  details of the format string.
01230  *
01231  *     "%05d" % 123                              #=> "00123"
01232  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01233  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01234  */
01235 
01236 static VALUE
01237 rb_str_format_m(VALUE str, VALUE arg)
01238 {
01239     volatile VALUE tmp = rb_check_array_type(arg);
01240 
01241     if (!NIL_P(tmp)) {
01242         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01243     }
01244     return rb_str_format(1, &arg, str);
01245 }
01246 
01247 static inline void
01248 str_modifiable(VALUE str)
01249 {
01250     if (FL_TEST(str, STR_TMPLOCK)) {
01251         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01252     }
01253     if (OBJ_FROZEN(str)) rb_error_frozen("string");
01254     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01255         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01256 }
01257 
01258 static inline int
01259 str_independent(VALUE str)
01260 {
01261     str_modifiable(str);
01262     if (!STR_SHARED_P(str)) return 1;
01263     if (STR_EMBED_P(str)) return 1;
01264     return 0;
01265 }
01266 
01267 static void
01268 str_make_independent(VALUE str)
01269 {
01270     char *ptr;
01271     long len = RSTRING_LEN(str);
01272 
01273     ptr = ALLOC_N(char, len+1);
01274     if (RSTRING_PTR(str)) {
01275         memcpy(ptr, RSTRING_PTR(str), len);
01276     }
01277     STR_SET_NOEMBED(str);
01278     ptr[len] = 0;
01279     RSTRING(str)->as.heap.ptr = ptr;
01280     RSTRING(str)->as.heap.len = len;
01281     RSTRING(str)->as.heap.aux.capa = len;
01282     STR_UNSET_NOCAPA(str);
01283 }
01284 
01285 void
01286 rb_str_modify(VALUE str)
01287 {
01288     if (!str_independent(str))
01289         str_make_independent(str);
01290     ENC_CODERANGE_CLEAR(str);
01291 }
01292 
01293 /* As rb_str_modify(), but don't clear coderange */
01294 static void
01295 str_modify_keep_cr(VALUE str)
01296 {
01297     if (!str_independent(str))
01298         str_make_independent(str);
01299     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01300         /* Force re-scan later */
01301         ENC_CODERANGE_CLEAR(str);
01302 }
01303 
01304 static inline void
01305 str_discard(VALUE str)
01306 {
01307     str_modifiable(str);
01308     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01309         xfree(RSTRING_PTR(str));
01310         RSTRING(str)->as.heap.ptr = 0;
01311         RSTRING(str)->as.heap.len = 0;
01312     }
01313 }
01314 
01315 void
01316 rb_str_associate(VALUE str, VALUE add)
01317 {
01318     /* sanity check */
01319     if (OBJ_FROZEN(str)) rb_error_frozen("string");
01320     if (STR_ASSOC_P(str)) {
01321         /* already associated */
01322         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01323     }
01324     else {
01325         if (STR_SHARED_P(str)) {
01326             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01327             str_make_independent(str);
01328             if (STR_ASSOC_P(assoc)) {
01329                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01330                 rb_ary_concat(assoc, add);
01331                 add = assoc;
01332             }
01333         }
01334         else if (STR_EMBED_P(str)) {
01335             str_make_independent(str);
01336         }
01337         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01338             RESIZE_CAPA(str, RSTRING_LEN(str));
01339         }
01340         FL_SET(str, STR_ASSOC);
01341         RBASIC(add)->klass = 0;
01342         RSTRING(str)->as.heap.aux.shared = add;
01343     }
01344 }
01345 
01346 VALUE
01347 rb_str_associated(VALUE str)
01348 {
01349     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01350     if (STR_ASSOC_P(str)) {
01351         return RSTRING(str)->as.heap.aux.shared;
01352     }
01353     return Qfalse;
01354 }
01355 
01356 VALUE
01357 rb_string_value(volatile VALUE *ptr)
01358 {
01359     VALUE s = *ptr;
01360     if (TYPE(s) != T_STRING) {
01361         s = rb_str_to_str(s);
01362         *ptr = s;
01363     }
01364     return s;
01365 }
01366 
01367 char *
01368 rb_string_value_ptr(volatile VALUE *ptr)
01369 {
01370     VALUE str = rb_string_value(ptr);
01371     return RSTRING_PTR(str);
01372 }
01373 
01374 char *
01375 rb_string_value_cstr(volatile VALUE *ptr)
01376 {
01377     VALUE str = rb_string_value(ptr);
01378     char *s = RSTRING_PTR(str);
01379     long len = RSTRING_LEN(str);
01380 
01381     if (!s || memchr(s, 0, len)) {
01382         rb_raise(rb_eArgError, "string contains null byte");
01383     }
01384     if (s[len]) {
01385         rb_str_modify(str);
01386         s = RSTRING_PTR(str);
01387         s[RSTRING_LEN(str)] = 0;
01388     }
01389     return s;
01390 }
01391 
01392 VALUE
01393 rb_check_string_type(VALUE str)
01394 {
01395     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01396     return str;
01397 }
01398 
01399 /*
01400  *  call-seq:
01401  *     String.try_convert(obj) -> string or nil
01402  *
01403  *  Try to convert <i>obj</i> into a String, using to_str method.
01404  *  Returns converted string or nil if <i>obj</i> cannot be converted
01405  *  for any reason.
01406  *
01407  *     String.try_convert("str")     #=> "str"
01408  *     String.try_convert(/re/)      #=> nil
01409  */
01410 static VALUE
01411 rb_str_s_try_convert(VALUE dummy, VALUE str)
01412 {
01413     return rb_check_string_type(str);
01414 }
01415 
01416 char*
01417 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01418 {
01419     if (rb_enc_mbmaxlen(enc) == 1) {
01420         p += nth;
01421     }
01422     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01423         p += nth * rb_enc_mbmaxlen(enc);
01424     }
01425     else if (rb_enc_asciicompat(enc)) {
01426         const char *p2, *e2;
01427         int n;
01428 
01429         while (p < e && 0 < nth) {
01430             e2 = p + nth;
01431             if (e < e2)
01432                 return (char *)e;
01433             if (ISASCII(*p)) {
01434                 p2 = search_nonascii(p, e2);
01435                 if (!p2)
01436                     return (char *)e2;
01437                 nth -= p2 - p;
01438                 p = p2;
01439             }
01440             n = rb_enc_mbclen(p, e, enc);
01441             p += n;
01442             nth--;
01443         }
01444         if (nth != 0)
01445             return (char *)e;
01446         return (char *)p;
01447     }
01448     else {
01449         while (p<e && nth--) {
01450             p += rb_enc_mbclen(p, e, enc);
01451         }
01452     }
01453     if (p > e) p = e;
01454     return (char*)p;
01455 }
01456 
01457 static char*
01458 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01459 {
01460     if (singlebyte)
01461         p += nth;
01462     else {
01463         p = rb_enc_nth(p, e, nth, enc);
01464     }
01465     if (!p) return 0;
01466     if (p > e) p = e;
01467     return (char *)p;
01468 }
01469 
01470 /* char offset to byte offset */
01471 static long
01472 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01473 {
01474     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01475     if (!pp) return e - p;
01476     return pp - p;
01477 }
01478 
01479 long
01480 rb_str_offset(VALUE str, long pos)
01481 {
01482     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01483                       STR_ENC_GET(str), single_byte_optimizable(str));
01484 }
01485 
#ifdef NONASCII_MASK
/*
 * Pointer to the nth character of the UTF-8 byte range [p, e), found by
 * counting bytes that satisfy is_utf8_lead_byte().  Returns e when the
 * range holds fewer characters.
 * NOTE(review): callers in this section only use this for strings whose
 * code range is ENC_CODERANGE_VALID -- confirm that holds elsewhere.
 */
static char *
str_utf8_nth(const char *p, const char *e, long nth)
{
    if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* First word boundary at or after p, last one at or before e. */
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE *)(~align_mask & (VALUE)e);

        /* Unaligned head, one byte at a time. */
        for (; p < (const char *)word; p++) {
            if (is_utf8_lead_byte(*p)) nth--;
        }
        /* Whole words, while a full word cannot overshoot the target. */
        do {
            nth -= count_utf8_lead_bytes_with_word(word);
            word++;
        } while (word < word_end && (int)sizeof(VALUE) <= nth);
        p = (const char *)word;
    }
    /* Finish byte by byte; stop on the lead byte of the target character. */
    for (; p < e; p++) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
    }
    return (char *)p;
}

/* Byte offset, relative to p, of the nth character of [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *target = str_utf8_nth(p, e, nth);

    return target - p;
}
#endif
01522 
01523 /* byte offset to char offset */
01524 long
01525 rb_str_sublen(VALUE str, long pos)
01526 {
01527     if (single_byte_optimizable(str) || pos < 0)
01528         return pos;
01529     else {
01530         char *p = RSTRING_PTR(str);
01531         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01532     }
01533 }
01534 
01535 VALUE
01536 rb_str_subseq(VALUE str, long beg, long len)
01537 {
01538     VALUE str2;
01539 
01540     if (RSTRING_LEN(str) == beg + len &&
01541         RSTRING_EMBED_LEN_MAX < len) {
01542         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01543         rb_str_drop_bytes(str2, beg);
01544     }
01545     else {
01546         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01547     }
01548 
01549     rb_enc_cr_str_copy_for_substr(str2, str);
01550     OBJ_INFECT(str2, str);
01551 
01552     return str2;
01553 }
01554 
/*
 * Return the substring of str that starts at character index beg and spans
 * at most len characters.  A negative beg counts from the end of the string.
 * Returns Qnil when len is negative or beg lies outside the string.
 *
 * Up to the `sub` label, beg and len are character counts; on reaching it,
 * p points at the first byte of the result and len has been converted to a
 * byte count.
 */
VALUE
rb_str_substr(VALUE str, long beg, long len)
{
    rb_encoding *enc = STR_ENC_GET(str);
    VALUE str2;
    char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);

    if (len < 0) return Qnil;
    if (!RSTRING_LEN(str)) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* One byte per character: character indices are byte indices. */
        if (beg > RSTRING_LEN(str)) return Qnil;
        if (beg < 0) {
            beg += RSTRING_LEN(str);
            if (beg < 0) return Qnil;
        }
        if (beg + len > RSTRING_LEN(str))
            len = RSTRING_LEN(str) - beg;
        if (len <= 0) {
            len = 0;
            p = 0;
        }
        else
            p = s + beg;
        goto sub;
    }
    if (beg < 0) {
        /* No more than -beg characters exist from that position to the end. */
        if (len > -beg) len = -beg;
        if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
            /* The start is near the end: walk backwards from the end rather
             * than counting the whole string. */
            beg = -beg;
            /* Move e back to the end of the requested span ... */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return Qnil;
            /* ... then move p back len characters to its start. */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return Qnil;
            len = e - p;
            goto sub;
        }
        else {
            beg += str_strlen(str, enc);
            if (beg < 0) return Qnil;
        }
    }
    else if (beg > 0 && beg > str_strlen(str, enc)) {
        return Qnil;
    }
    if (len == 0) {
        p = 0;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        /* Valid UTF-8: use the lead-byte counting helpers. */
        p = str_utf8_nth(s, e, beg);
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: positions follow by multiplication. */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            p = e;
            len = 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  sub:
    if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
        /* Result too long to embed and judged to end at the end of str:
         * build a string sharing str's buffer and point it at the last
         * len bytes. */
        str2 = rb_str_new4(str);
        str2 = str_new3(rb_obj_class(str2), str2);
        RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
        RSTRING(str2)->as.heap.len = len;
    }
    else {
        /* Otherwise copy the bytes into a new string. */
        str2 = rb_str_new5(str, p, len);
        rb_enc_cr_str_copy_for_substr(str2, str);
        OBJ_INFECT(str2, str);
    }

    return str2;
}
01646 
01647 VALUE
01648 rb_str_freeze(VALUE str)
01649 {
01650     if (STR_ASSOC_P(str)) {
01651         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01652         OBJ_FREEZE(ary);
01653     }
01654     return rb_obj_freeze(str);
01655 }
01656 
/* NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this section; presumably
 * it provides rb_str_dup_frozen as an alias of rb_str_new_frozen -- confirm. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
/* For the rest of this file, the name rb_str_dup_frozen expands to rb_str_new_frozen. */
#define rb_str_dup_frozen rb_str_new_frozen
01659 
01660 VALUE
01661 rb_str_locktmp(VALUE str)
01662 {
01663     if (FL_TEST(str, STR_TMPLOCK)) {
01664         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01665     }
01666     FL_SET(str, STR_TMPLOCK);
01667     return str;
01668 }
01669 
01670 VALUE
01671 rb_str_unlocktmp(VALUE str)
01672 {
01673     if (!FL_TEST(str, STR_TMPLOCK)) {
01674         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01675     }
01676     FL_UNSET(str, STR_TMPLOCK);
01677     return str;
01678 }
01679 
01680 void
01681 rb_str_set_len(VALUE str, long len)
01682 {
01683     rb_str_modify(str);
01684     STR_SET_LEN(str, len);
01685     RSTRING_PTR(str)[len] = '\0';
01686 }
01687 
/*
 * Change the byte length of str to len, switching between the embedded and
 * heap representations as needed.  Raises ArgumentError for a negative len.
 * Bytes added by growing the string are not initialised here.  Returns str.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    long slen;

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    rb_str_modify(str);
    slen = RSTRING_LEN(str);
    if (len != slen) {
        if (STR_EMBED_P(str)) {
            char *ptr;
            if (len <= RSTRING_EMBED_LEN_MAX) {
                /* Embedded before and after: only the length changes. */
                STR_SET_EMBED_LEN(str, len);
                RSTRING(str)->as.ary[len] = '\0';
                return str;
            }
            /* Embedded -> heap: copy the bytes out before the heap fields
             * are written. */
            ptr = ALLOC_N(char,len+1);
            MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
            RSTRING(str)->as.heap.ptr = ptr;
            STR_SET_NOEMBED(str);
        }
        else if (len <= RSTRING_EMBED_LEN_MAX) {
            /* Heap -> embedded: remember the heap pointer first, since it is
             * needed after the representation switch. */
            char *ptr = RSTRING(str)->as.heap.ptr;
            STR_SET_EMBED(str);
            if (slen > len) slen = len;
            if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
            RSTRING(str)->as.ary[len] = '\0';
            STR_SET_EMBED_LEN(str, len);
            xfree(ptr);
            return str;
        }
        else if (slen < len || slen - len > 1024) {
            /* Reallocate when growing, or when shrinking by more than 1024 bytes. */
            REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
        }
        if (!STR_NOCAPA_P(str)) {
            RSTRING(str)->as.heap.aux.capa = len;
        }
        RSTRING(str)->as.heap.len = len;
        RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
    }
    return str;
}
01733 
01734 static VALUE
01735 str_buf_cat(VALUE str, const char *ptr, long len)
01736 {
01737     long capa, total, off = -1;
01738 
01739     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01740         off = ptr - RSTRING_PTR(str);
01741     }
01742     rb_str_modify(str);
01743     if (len == 0) return 0;
01744     if (STR_ASSOC_P(str)) {
01745         FL_UNSET(str, STR_ASSOC);
01746         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01747     }
01748     else if (STR_EMBED_P(str)) {
01749         capa = RSTRING_EMBED_LEN_MAX;
01750     }
01751     else {
01752         capa = RSTRING(str)->as.heap.aux.capa;
01753     }
01754     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01755         rb_raise(rb_eArgError, "string sizes too big");
01756     }
01757     total = RSTRING_LEN(str)+len;
01758     if (capa <= total) {
01759         while (total > capa) {
01760             if (capa + 1 >= LONG_MAX / 2) {
01761                 capa = (total + 4095) / 4096;
01762                 break;
01763             }
01764             capa = (capa + 1) * 2;
01765         }
01766         RESIZE_CAPA(str, capa);
01767     }
01768     if (off != -1) {
01769         ptr = RSTRING_PTR(str) + off;
01770     }
01771     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01772     STR_SET_LEN(str, total);
01773     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01774 
01775     return str;
01776 }
01777 
/* Append the NUL-terminated C string ptr with str_buf_cat().  ptr is expanded
 * twice, so it must not have side effects. */
#define str_buf_cat2(str, ptr) str_buf_cat(str, (ptr), strlen(ptr))
01779 
01780 VALUE
01781 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01782 {
01783     if (len == 0) return str;
01784     if (len < 0) {
01785         rb_raise(rb_eArgError, "negative string size (or size too big)");
01786     }
01787     return str_buf_cat(str, ptr, len);
01788 }
01789 
01790 VALUE
01791 rb_str_buf_cat2(VALUE str, const char *ptr)
01792 {
01793     return rb_str_buf_cat(str, ptr, strlen(ptr));
01794 }
01795 
01796 VALUE
01797 rb_str_cat(VALUE str, const char *ptr, long len)
01798 {
01799     if (len < 0) {
01800         rb_raise(rb_eArgError, "negative string size (or size too big)");
01801     }
01802     if (STR_ASSOC_P(str)) {
01803         rb_str_modify(str);
01804         if (STR_EMBED_P(str)) str_make_independent(str);
01805         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01806         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01807         RSTRING(str)->as.heap.len += len;
01808         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
01809         return str;
01810     }
01811 
01812     return rb_str_buf_cat(str, ptr, len);
01813 }
01814 
01815 VALUE
01816 rb_str_cat2(VALUE str, const char *ptr)
01817 {
01818     return rb_str_cat(str, ptr, strlen(ptr));
01819 }
01820 
/*
 * Append len bytes at ptr to str, taking encodings and code ranges into
 * account.
 *
 *   ptr_encindex  encoding index of the bytes at ptr
 *   ptr_cr        their code range, or ENC_CODERANGE_UNKNOWN
 *   ptr_cr_ret    if non-NULL, receives the code range finally used for ptr
 *                 (possibly obtained by scanning)
 *
 * Raises Encoding::CompatibilityError when the two encodings cannot be
 * combined, and ArgumentError when len is negative.  On success str carries
 * the resulting encoding index and code range.  Returns str.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    int str_a8 = ENCODING_IS_ASCII8BIT(str);
    /* NOTE(review): treats encoding index 0 as ASCII-8BIT, going by the
     * variable name -- confirm against the encoding table. */
    int ptr_a8 = ptr_encindex == 0;

    str_cr = ENC_CODERANGE(str);

    if (str_encindex == ptr_encindex) {
        /* Same encoding: scan ptr only when the result can make use of it. */
        if (str_cr == ENC_CODERANGE_UNKNOWN ||
            (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
            ptr_cr = ENC_CODERANGE_UNKNOWN;
        }
        else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        rb_encoding *str_enc = rb_enc_from_index(str_encindex);
        rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* With a non-ASCII-compatible encoding involved, mixing is only
             * possible when one side is empty. */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* Different encodings can only be combined when at least one side is 7-bit. */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
      incompatible:
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
            rb_enc_name(rb_enc_from_index(str_encindex)),
            rb_enc_name(rb_enc_from_index(ptr_encindex)));
    }

    /* Choose the encoding index and code range of the result. */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = !str_a8 ? str_encindex : ptr_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* Appended bytes may complete a broken sequence, so force a re-scan. */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
01910 
01911 VALUE
01912 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01913 {
01914     return rb_enc_cr_str_buf_cat(str, ptr, len,
01915         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01916 }
01917 
01918 VALUE
01919 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01920 {
01921     /* ptr must reference NUL terminated ASCII string. */
01922     int encindex = ENCODING_GET(str);
01923     rb_encoding *enc = rb_enc_from_index(encindex);
01924     if (rb_enc_asciicompat(enc)) {
01925         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01926             encindex, ENC_CODERANGE_7BIT, 0);
01927     }
01928     else {
01929         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01930         while (*ptr) {
01931             unsigned int c = (unsigned char)*ptr;
01932             int len = rb_enc_codelen(c, enc);
01933             rb_enc_mbcput(c, buf, enc);
01934             rb_enc_cr_str_buf_cat(str, buf, len,
01935                 encindex, ENC_CODERANGE_VALID, 0);
01936             ptr++;
01937         }
01938         return str;
01939     }
01940 }
01941 
01942 VALUE
01943 rb_str_buf_append(VALUE str, VALUE str2)
01944 {
01945     int str2_cr;
01946 
01947     str2_cr = ENC_CODERANGE(str2);
01948 
01949     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01950         ENCODING_GET(str2), str2_cr, &str2_cr);
01951 
01952     OBJ_INFECT(str, str2);
01953     ENC_CODERANGE_SET(str2, str2_cr);
01954 
01955     return str;
01956 }
01957 
01958 VALUE
01959 rb_str_append(VALUE str, VALUE str2)
01960 {
01961     rb_encoding *enc;
01962     int cr, cr2;
01963 
01964     StringValue(str2);
01965     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01966         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01967         enc = rb_enc_check(str, str2);
01968         cr = ENC_CODERANGE(str);
01969         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01970         rb_str_modify(str);
01971         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01972         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01973                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01974         RSTRING(str)->as.heap.len = len;
01975         rb_enc_associate(str, enc);
01976         ENC_CODERANGE_SET(str, cr);
01977         OBJ_INFECT(str, str2);
01978         return str;
01979     }
01980     return rb_str_buf_append(str, str2);
01981 }
01982 
01983 
01984 /*
01985  *  call-seq:
01986  *     str << integer       -> str
01987  *     str.concat(integer)  -> str
01988  *     str << obj           -> str
01989  *     str.concat(obj)      -> str
01990  *
01991  *  Append---Concatenates the given object to <i>str</i>. If the object is a
01992  *  <code>Integer</code>, it is considered as a codepoint, and is converted
01993  *  to a character before concatenation.
01994  *
01995  *     a = "hello "
01996  *     a << "world"   #=> "hello world"
01997  *     a.concat(33)   #=> "hello world!"
01998  */
01999 
02000 VALUE
02001 rb_str_concat(VALUE str1, VALUE str2)
02002 {
02003     unsigned int lc;
02004 
02005     if (FIXNUM_P(str2)) {
02006         if ((int)str2 < 0)
02007             rb_raise(rb_eRangeError, "negative argument");
02008         lc = FIX2UINT(str2);
02009     }
02010     else if (TYPE(str2) == T_BIGNUM) {
02011         if (!RBIGNUM_SIGN(str2))
02012             rb_raise(rb_eRangeError, "negative argument");
02013         lc = NUM2UINT(str2);
02014     }
02015     else {
02016         return rb_str_append(str1, str2);
02017     }
02018 #if SIZEOF_INT < SIZEOF_VALUE
02019     if ((VALUE)lc > UINT_MAX) {
02020         rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02021     }
02022 #endif
02023     {
02024         rb_encoding *enc = STR_ENC_GET(str1);
02025         long pos = RSTRING_LEN(str1);
02026         int cr = ENC_CODERANGE(str1);
02027         int len;
02028 
02029         if ((len = rb_enc_codelen(lc, enc)) <= 0) {
02030             rb_raise(rb_eRangeError, "%u invalid char", lc);
02031         }
02032         rb_str_resize(str1, pos+len);
02033         rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc);
02034         if (cr == ENC_CODERANGE_7BIT && lc > 127)
02035             cr = ENC_CODERANGE_VALID;
02036         ENC_CODERANGE_SET(str1, cr);
02037         return str1;
02038     }
02039 }
02040 
02041 st_index_t
02042 rb_memhash(const void *ptr, long len)
02043 {
02044     return st_hash(ptr, len, rb_hash_start(0));
02045 }
02046 
02047 st_index_t
02048 rb_str_hash(VALUE str)
02049 {
02050     int e = ENCODING_GET(str);
02051     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02052         e = 0;
02053     }
02054     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02055 }
02056 
02057 int
02058 rb_str_hash_cmp(VALUE str1, VALUE str2)
02059 {
02060     long len;
02061 
02062     if (!rb_str_comparable(str1, str2)) return 1;
02063     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02064         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02065         return 0;
02066     }
02067     return 1;
02068 }
02069 
02070 /*
02071  * call-seq:
02072  *    str.hash   -> fixnum
02073  *
02074  * Return a hash based on the string's length and content.
02075  */
02076 
02077 static VALUE
02078 rb_str_hash_m(VALUE str)
02079 {
02080     st_index_t hval = rb_str_hash(str);
02081     return INT2FIX(hval);
02082 }
02083 
/* smaller of a and b; note that each argument may be evaluated twice */
#define lesser(a,b) (((b)<(a))?(b):(a))
02085 
02086 int
02087 rb_str_comparable(VALUE str1, VALUE str2)
02088 {
02089     int idx1, idx2;
02090     int rc1, rc2;
02091 
02092     if (RSTRING_LEN(str1) == 0) return TRUE;
02093     if (RSTRING_LEN(str2) == 0) return TRUE;
02094     idx1 = ENCODING_GET(str1);
02095     idx2 = ENCODING_GET(str2);
02096     if (idx1 == idx2) return TRUE;
02097     rc1 = rb_enc_str_coderange(str1);
02098     rc2 = rb_enc_str_coderange(str2);
02099     if (rc1 == ENC_CODERANGE_7BIT) {
02100         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02101         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02102             return TRUE;
02103     }
02104     if (rc2 == ENC_CODERANGE_7BIT) {
02105         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02106             return TRUE;
02107     }
02108     return FALSE;
02109 }
02110 
02111 int
02112 rb_str_cmp(VALUE str1, VALUE str2)
02113 {
02114     long len;
02115     int retval;
02116 
02117     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02118     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02119     if (retval == 0) {
02120         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02121             if (!rb_str_comparable(str1, str2)) {
02122                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02123                     return 1;
02124                 return -1;
02125             }
02126             return 0;
02127         }
02128         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02129         return -1;
02130     }
02131     if (retval > 0) return 1;
02132     return -1;
02133 }
02134 
02135 /* expect tail call optimization */
02136 static VALUE
02137 str_eql(const VALUE str1, const VALUE str2)
02138 {
02139     const long len = RSTRING_LEN(str1);
02140 
02141     if (len != RSTRING_LEN(str2)) return Qfalse;
02142     if (!rb_str_comparable(str1, str2)) return Qfalse;
02143     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02144         return Qtrue;
02145     return Qfalse;
02146 }
02147 /*
02148  *  call-seq:
02149  *     str == obj   -> true or false
02150  *
02151  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02152  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02153  *  <code><=></code> <i>obj</i> returns zero.
02154  */
02155 
02156 VALUE
02157 rb_str_equal(VALUE str1, VALUE str2)
02158 {
02159     if (str1 == str2) return Qtrue;
02160     if (TYPE(str2) != T_STRING) {
02161         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02162             return Qfalse;
02163         }
02164         return rb_equal(str2, str1);
02165     }
02166     return str_eql(str1, str2);
02167 }
02168 
02169 /*
02170  * call-seq:
02171  *   str.eql?(other)   -> true or false
02172  *
02173  * Two strings are equal if they have the same length and content.
02174  */
02175 
02176 static VALUE
02177 rb_str_eql(VALUE str1, VALUE str2)
02178 {
02179     if (TYPE(str2) != T_STRING) return Qfalse;
02180     return str_eql(str1, str2);
02181 }
02182 
02183 /*
02184  *  call-seq:
02185  *     str <=> other_str   -> -1, 0, +1 or nil
02186  *
02187  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
02188  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
02189  *  <i>str</i>. If the strings are of different lengths, and the strings are
02190  *  equal when compared up to the shortest length, then the longer string is
02191  *  considered greater than the shorter one. In older versions of Ruby, setting
02192  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
02193  *  in favor of using <code>String#casecmp</code>.
02194  *
02195  *  <code><=></code> is the basis for the methods <code><</code>,
02196  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
02197  *  included from module <code>Comparable</code>.  The method
02198  *  <code>String#==</code> does not use <code>Comparable#==</code>.
02199  *
02200  *     "abcdef" <=> "abcde"     #=> 1
02201  *     "abcdef" <=> "abcdef"    #=> 0
02202  *     "abcdef" <=> "abcdefg"   #=> -1
02203  *     "abcdef" <=> "ABCDEF"    #=> 1
02204  */
02205 
02206 static VALUE
02207 rb_str_cmp_m(VALUE str1, VALUE str2)
02208 {
02209     long result;
02210 
02211     if (TYPE(str2) != T_STRING) {
02212         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02213             return Qnil;
02214         }
02215         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02216             return Qnil;
02217         }
02218         else {
02219             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02220 
02221             if (NIL_P(tmp)) return Qnil;
02222             if (!FIXNUM_P(tmp)) {
02223                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02224             }
02225             result = -FIX2LONG(tmp);
02226         }
02227     }
02228     else {
02229         result = rb_str_cmp(str1, str2);
02230     }
02231     return LONG2NUM(result);
02232 }
02233 
02234 /*
02235  *  call-seq:
02236  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02237  *
02238  *  Case-insensitive version of <code>String#<=></code>.
02239  *
02240  *     "abcdef".casecmp("abcde")     #=> 1
02241  *     "aBcDeF".casecmp("abcdef")    #=> 0
02242  *     "abcdef".casecmp("abcdefg")   #=> -1
02243  *     "abcdef".casecmp("ABCDEF")    #=> 0
02244  */
02245 
02246 static VALUE
02247 rb_str_casecmp(VALUE str1, VALUE str2)
02248 {
02249     long len;
02250     rb_encoding *enc;
02251     char *p1, *p1end, *p2, *p2end;
02252 
02253     StringValue(str2);
02254     enc = rb_enc_compatible(str1, str2);
02255     if (!enc) {
02256         return Qnil;
02257     }
02258 
02259     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02260     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02261     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02262         while (p1 < p1end && p2 < p2end) {
02263             if (*p1 != *p2) {
02264                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02265                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02266                 if (c1 != c2)
02267                     return INT2FIX(c1 < c2 ? -1 : 1);
02268             }
02269             p1++;
02270             p2++;
02271         }
02272     }
02273     else {
02274         while (p1 < p1end && p2 < p2end) {
02275             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02276             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02277 
02278             if (0 <= c1 && 0 <= c2) {
02279                 c1 = TOUPPER(c1);
02280                 c2 = TOUPPER(c2);
02281                 if (c1 != c2)
02282                     return INT2FIX(c1 < c2 ? -1 : 1);
02283             }
02284             else {
02285                 int r;
02286                 l1 = rb_enc_mbclen(p1, p1end, enc);
02287                 l2 = rb_enc_mbclen(p2, p2end, enc);
02288                 len = l1 < l2 ? l1 : l2;
02289                 r = memcmp(p1, p2, len);
02290                 if (r != 0)
02291                     return INT2FIX(r < 0 ? -1 : 1);
02292                 if (l1 != l2)
02293                     return INT2FIX(l1 < l2 ? -1 : 1);
02294             }
02295             p1 += l1;
02296             p2 += l2;
02297         }
02298     }
02299     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02300     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02301     return INT2FIX(-1);
02302 }
02303 
02304 static long
02305 rb_str_index(VALUE str, VALUE sub, long offset)
02306 {
02307     long pos;
02308     char *s, *sptr, *e;
02309     long len, slen;
02310     rb_encoding *enc;
02311 
02312     enc = rb_enc_check(str, sub);
02313     if (is_broken_string(sub)) {
02314         return -1;
02315     }
02316     len = str_strlen(str, enc);
02317     slen = str_strlen(sub, enc);
02318     if (offset < 0) {
02319         offset += len;
02320         if (offset < 0) return -1;
02321     }
02322     if (len - offset < slen) return -1;
02323     s = RSTRING_PTR(str);
02324     e = s + RSTRING_LEN(str);
02325     if (offset) {
02326         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02327         s += offset;
02328     }
02329     if (slen == 0) return offset;
02330     /* need proceed one character at a time */
02331     sptr = RSTRING_PTR(sub);
02332     slen = RSTRING_LEN(sub);
02333     len = RSTRING_LEN(str) - offset;
02334     for (;;) {
02335         char *t;
02336         pos = rb_memsearch(sptr, slen, s, len, enc);
02337         if (pos < 0) return pos;
02338         t = rb_enc_right_char_head(s, s+pos, e, enc);
02339         if (t == s + pos) break;
02340         if ((len -= t - s) <= 0) return -1;
02341         offset += t - s;
02342         s = t;
02343     }
02344     return pos + offset;
02345 }
02346 
02347 
02348 /*
02349  *  call-seq:
02350  *     str.index(substring [, offset])   -> fixnum or nil
02351  *     str.index(regexp [, offset])      -> fixnum or nil
02352  *
02353  *  Returns the index of the first occurrence of the given <i>substring</i> or
02354  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02355  *  found. If the second parameter is present, it specifies the position in the
02356  *  string to begin the search.
02357  *
02358  *     "hello".index('e')             #=> 1
02359  *     "hello".index('lo')            #=> 3
02360  *     "hello".index('a')             #=> nil
02361  *     "hello".index(?e)              #=> 1
02362  *     "hello".index(/[aeiou]/, -3)   #=> 4
02363  */
02364 
02365 static VALUE
02366 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02367 {
02368     VALUE sub;
02369     VALUE initpos;
02370     long pos;
02371 
02372     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02373         pos = NUM2LONG(initpos);
02374     }
02375     else {
02376         pos = 0;
02377     }
02378     if (pos < 0) {
02379         pos += str_strlen(str, STR_ENC_GET(str));
02380         if (pos < 0) {
02381             if (TYPE(sub) == T_REGEXP) {
02382                 rb_backref_set(Qnil);
02383             }
02384             return Qnil;
02385         }
02386     }
02387 
02388     switch (TYPE(sub)) {
02389       case T_REGEXP:
02390         if (pos > str_strlen(str, STR_ENC_GET(str)))
02391             return Qnil;
02392         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02393                          rb_enc_check(str, sub), single_byte_optimizable(str));
02394 
02395         pos = rb_reg_search(sub, str, pos, 0);
02396         pos = rb_str_sublen(str, pos);
02397         break;
02398 
02399       default: {
02400         VALUE tmp;
02401 
02402         tmp = rb_check_string_type(sub);
02403         if (NIL_P(tmp)) {
02404             rb_raise(rb_eTypeError, "type mismatch: %s given",
02405                      rb_obj_classname(sub));
02406         }
02407         sub = tmp;
02408       }
02409         /* fall through */
02410       case T_STRING:
02411         pos = rb_str_index(str, sub, pos);
02412         pos = rb_str_sublen(str, pos);
02413         break;
02414     }
02415 
02416     if (pos == -1) return Qnil;
02417     return LONG2NUM(pos);
02418 }
02419 
02420 static long
02421 rb_str_rindex(VALUE str, VALUE sub, long pos)
02422 {
02423     long len, slen;
02424     char *s, *sbeg, *e, *t;
02425     rb_encoding *enc;
02426     int singlebyte = single_byte_optimizable(str);
02427 
02428     enc = rb_enc_check(str, sub);
02429     if (is_broken_string(sub)) {
02430         return -1;
02431     }
02432     len = str_strlen(str, enc);
02433     slen = str_strlen(sub, enc);
02434     /* substring longer than string */
02435     if (len < slen) return -1;
02436     if (len - pos < slen) {
02437         pos = len - slen;
02438     }
02439     if (len == 0) {
02440         return pos;
02441     }
02442     sbeg = RSTRING_PTR(str);
02443     e = RSTRING_END(str);
02444     t = RSTRING_PTR(sub);
02445     slen = RSTRING_LEN(sub);
02446     for (;;) {
02447         s = str_nth(sbeg, e, pos, enc, singlebyte);
02448         if (!s) return -1;
02449         if (memcmp(s, t, slen) == 0) {
02450             return pos;
02451         }
02452         if (pos == 0) break;
02453         pos--;
02454     }
02455     return -1;
02456 }
02457 
02458 
02459 /*
02460  *  call-seq:
02461  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02462  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02463  *
02464  *  Returns the index of the last occurrence of the given <i>substring</i> or
02465  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02466  *  found. If the second parameter is present, it specifies the position in the
02467  *  string to end the search---characters beyond this point will not be
02468  *  considered.
02469  *
02470  *     "hello".rindex('e')             #=> 1
02471  *     "hello".rindex('l')             #=> 3
02472  *     "hello".rindex('a')             #=> nil
02473  *     "hello".rindex(?e)              #=> 1
02474  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02475  */
02476 
02477 static VALUE
02478 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02479 {
02480     VALUE sub;
02481     VALUE vpos;
02482     rb_encoding *enc = STR_ENC_GET(str);
02483     long pos, len = str_strlen(str, enc);
02484 
02485     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02486         pos = NUM2LONG(vpos);
02487         if (pos < 0) {
02488             pos += len;
02489             if (pos < 0) {
02490                 if (TYPE(sub) == T_REGEXP) {
02491                     rb_backref_set(Qnil);
02492                 }
02493                 return Qnil;
02494             }
02495         }
02496         if (pos > len) pos = len;
02497     }
02498     else {
02499         pos = len;
02500     }
02501 
02502     switch (TYPE(sub)) {
02503       case T_REGEXP:
02504         /* enc = rb_get_check(str, sub); */
02505         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02506                          STR_ENC_GET(str), single_byte_optimizable(str));
02507 
02508         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02509             pos = rb_reg_search(sub, str, pos, 1);
02510             pos = rb_str_sublen(str, pos);
02511         }
02512         if (pos >= 0) return LONG2NUM(pos);
02513         break;
02514 
02515       default: {
02516         VALUE tmp;
02517 
02518         tmp = rb_check_string_type(sub);
02519         if (NIL_P(tmp)) {
02520             rb_raise(rb_eTypeError, "type mismatch: %s given",
02521                      rb_obj_classname(sub));
02522         }
02523         sub = tmp;
02524       }
02525         /* fall through */
02526       case T_STRING:
02527         pos = rb_str_rindex(str, sub, pos);
02528         if (pos >= 0) return LONG2NUM(pos);
02529         break;
02530     }
02531     return Qnil;
02532 }
02533 
02534 /*
02535  *  call-seq:
02536  *     str =~ obj   -> fixnum or nil
02537  *
 *  Match---If <i>obj</i> is a <code>Regexp</code>, uses it as a pattern to
 *  match against <i>str</i>, and returns the position where the match starts,
 *  or <code>nil</code> if there is no match. Otherwise, invokes
 *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
 *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02543  *
02544  *     "cat o' 9 tails" =~ /\d/   #=> 7
02545  *     "cat o' 9 tails" =~ 9      #=> nil
02546  */
02547 
02548 static VALUE
02549 rb_str_match(VALUE x, VALUE y)
02550 {
02551     switch (TYPE(y)) {
02552       case T_STRING:
02553         rb_raise(rb_eTypeError, "type mismatch: String given");
02554 
02555       case T_REGEXP:
02556         return rb_reg_match(y, x);
02557 
02558       default:
02559         return rb_funcall(y, rb_intern("=~"), 1, x);
02560     }
02561 }
02562 
02563 
02564 static VALUE get_pat(VALUE, int);
02565 
02566 
02567 /*
02568  *  call-seq:
02569  *     str.match(pattern)        -> matchdata or nil
02570  *     str.match(pattern, pos)   -> matchdata or nil
02571  *
 *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
 *  then invokes its <code>match</code> method on <i>str</i>.  If the second
 *  parameter is present, it specifies the position in the string to begin the
 *  search.
02578  *
02579  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02580  *     'hello'.match('(.)\1')[0]   #=> "ll"
02581  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02582  *     'hello'.match('xx')         #=> nil
02583  *
 *  If a block is given, invokes the block with the MatchData if the match
 *  succeeds, so that you can write
02586  *
02587  *     str.match(pat) {|m| ...}
02588  *
02589  *  instead of
02590  *
02591  *     if m = str.match(pat)
02592  *       ...
02593  *     end
02594  *
02595  *  The return value is a value from block execution in this case.
02596  */
02597 
02598 static VALUE
02599 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02600 {
02601     VALUE re, result;
02602     if (argc < 1)
02603        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02604     re = argv[0];
02605     argv[0] = str;
02606     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02607     if (!NIL_P(result) && rb_block_given_p()) {
02608         return rb_yield(result);
02609     }
02610     return result;
02611 }
02612 
/* outcome of stepping a character to its successor or predecessor */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,      /* no usable neighbour (see enc_succ_alnum_char) */
    NEIGHBOR_FOUND = 1,         /* the buffer now holds the neighbouring character */
    NEIGHBOR_WRAPPED = 2        /* the step ran off the end of the range */
};
02618 
02619 static enum neighbor_char
02620 enc_succ_char(char *p, long len, rb_encoding *enc)
02621 {
02622     long i;
02623     int l;
02624     while (1) {
02625         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02626             p[i] = '\0';
02627         if (i < 0)
02628             return NEIGHBOR_WRAPPED;
02629         ++((unsigned char*)p)[i];
02630         l = rb_enc_precise_mbclen(p, p+len, enc);
02631         if (MBCLEN_CHARFOUND_P(l)) {
02632             l = MBCLEN_CHARFOUND_LEN(l);
02633             if (l == len) {
02634                 return NEIGHBOR_FOUND;
02635             }
02636             else {
02637                 memset(p+l, 0xff, len-l);
02638             }
02639         }
02640         if (MBCLEN_INVALID_P(l) && i < len-1) {
02641             long len2;
02642             int l2;
02643             for (len2 = len-1; 0 < len2; len2--) {
02644                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02645                 if (!MBCLEN_INVALID_P(l2))
02646                     break;
02647             }
02648             memset(p+len2+1, 0xff, len-(len2+1));
02649         }
02650     }
02651 }
02652 
02653 static enum neighbor_char
02654 enc_pred_char(char *p, long len, rb_encoding *enc)
02655 {
02656     long i;
02657     int l;
02658     while (1) {
02659         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02660             p[i] = '\xff';
02661         if (i < 0)
02662             return NEIGHBOR_WRAPPED;
02663         --((unsigned char*)p)[i];
02664         l = rb_enc_precise_mbclen(p, p+len, enc);
02665         if (MBCLEN_CHARFOUND_P(l)) {
02666             l = MBCLEN_CHARFOUND_LEN(l);
02667             if (l == len) {
02668                 return NEIGHBOR_FOUND;
02669             }
02670             else {
02671                 memset(p+l, 0, len-l);
02672             }
02673         }
02674         if (MBCLEN_INVALID_P(l) && i < len-1) {
02675             long len2;
02676             int l2;
02677             for (len2 = len-1; 0 < len2; len2--) {
02678                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02679                 if (!MBCLEN_INVALID_P(l2))
02680                     break;
02681             }
02682             memset(p+len2+1, 0, len-(len2+1));
02683         }
02684     }
02685 }
02686 
02687 /*
02688   overwrite +p+ by succeeding letter in +enc+ and returns
02689   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02690   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
02691   assuming each ranges are successive, and mbclen
02692   never change in each ranges.
02693   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
02694   character.
02695  */
02696 static enum neighbor_char
02697 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02698 {
02699     enum neighbor_char ret;
02700     unsigned int c;
02701     int ctype;
02702     int range;
02703     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02704 
02705     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02706     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02707         ctype = ONIGENC_CTYPE_DIGIT;
02708     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02709         ctype = ONIGENC_CTYPE_ALPHA;
02710     else
02711         return NEIGHBOR_NOT_CHAR;
02712 
02713     MEMCPY(save, p, char, len);
02714     ret = enc_succ_char(p, len, enc);
02715     if (ret == NEIGHBOR_FOUND) {
02716         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02717         if (rb_enc_isctype(c, ctype, enc))
02718             return NEIGHBOR_FOUND;
02719     }
02720     MEMCPY(p, save, char, len);
02721     range = 1;
02722     while (1) {
02723         MEMCPY(save, p, char, len);
02724         ret = enc_pred_char(p, len, enc);
02725         if (ret == NEIGHBOR_FOUND) {
02726             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02727             if (!rb_enc_isctype(c, ctype, enc)) {
02728                 MEMCPY(p, save, char, len);
02729                 break;
02730             }
02731         }
02732         else {
02733             MEMCPY(p, save, char, len);
02734             break;
02735         }
02736         range++;
02737     }
02738     if (range == 1) {
02739         return NEIGHBOR_NOT_CHAR;
02740     }
02741 
02742     if (ctype != ONIGENC_CTYPE_DIGIT) {
02743         MEMCPY(carry, p, char, len);
02744         return NEIGHBOR_WRAPPED;
02745     }
02746 
02747     MEMCPY(carry, p, char, len);
02748     enc_succ_char(carry, len, enc);
02749     return NEIGHBOR_WRAPPED;
02750 }
02751 
02752 
02753 /*
02754  *  call-seq:
02755  *     str.succ   -> new_str
02756  *     str.next   -> new_str
02757  *
02758  *  Returns the successor to <i>str</i>. The successor is calculated by
02759  *  incrementing characters starting from the rightmost alphanumeric (or
02760  *  the rightmost character if there are no alphanumerics) in the
02761  *  string. Incrementing a digit always results in another digit, and
02762  *  incrementing a letter results in another letter of the same case.
02763  *  Incrementing nonalphanumerics uses the underlying character set's
02764  *  collating sequence.
02765  *
02766  *  If the increment generates a ``carry,'' the character to the left of
02767  *  it is incremented. This process repeats until there is no carry,
02768  *  adding an additional character if necessary.
02769  *
02770  *     "abcd".succ        #=> "abce"
02771  *     "THX1138".succ     #=> "THX1139"
02772  *     "<<koala>>".succ   #=> "<<koalb>>"
02773  *     "1999zzz".succ     #=> "2000aaa"
02774  *     "ZZZ9999".succ     #=> "AAAA0000"
02775  *     "***".succ         #=> "**+"
02776  */
02777 
02778 VALUE
02779 rb_str_succ(VALUE orig)
02780 {
02781     rb_encoding *enc;
02782     VALUE str;
02783     char *sbeg, *s, *e, *last_alnum = 0;
02784     int c = -1;
02785     long l;
02786     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02787     long carry_pos = 0, carry_len = 1;
02788     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02789 
02790     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02791     rb_enc_cr_str_copy_for_substr(str, orig);
02792     OBJ_INFECT(str, orig);
02793     if (RSTRING_LEN(str) == 0) return str;
02794 
02795     enc = STR_ENC_GET(orig);
02796     sbeg = RSTRING_PTR(str);
02797     s = e = sbeg + RSTRING_LEN(str);
02798 
02799     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02800         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02801             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02802                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02803                 s = last_alnum;
02804                 break;
02805             }
02806         }
02807         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02808         neighbor = enc_succ_alnum_char(s, l, enc, carry);
02809         switch (neighbor) {
02810           case NEIGHBOR_NOT_CHAR:
02811             continue;
02812           case NEIGHBOR_FOUND:
02813             return str;
02814           case NEIGHBOR_WRAPPED:
02815             last_alnum = s;
02816             break;
02817         }
02818         c = 1;
02819         carry_pos = s - sbeg;
02820         carry_len = l;
02821     }
02822     if (c == -1) {              /* str contains no alnum */
02823         s = e;
02824         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02825             enum neighbor_char neighbor;
02826             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02827             neighbor = enc_succ_char(s, l, enc);
02828             if (neighbor == NEIGHBOR_FOUND)
02829                 return str;
02830             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02831                 /* wrapped to \0...\0.  search next valid char. */
02832                 enc_succ_char(s, l, enc);
02833             }
02834             if (!rb_enc_asciicompat(enc)) {
02835                 MEMCPY(carry, s, char, l);
02836                 carry_len = l;
02837             }
02838             carry_pos = s - sbeg;
02839         }
02840     }
02841     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02842     s = RSTRING_PTR(str) + carry_pos;
02843     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02844     memmove(s, carry, carry_len);
02845     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02846     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02847     rb_enc_str_coderange(str);
02848     return str;
02849 }
02850 
02851 
02852 /*
02853  *  call-seq:
02854  *     str.succ!   -> str
02855  *     str.next!   -> str
02856  *
02857  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
02858  *  place.
02859  */
02860 
02861 static VALUE
02862 rb_str_succ_bang(VALUE str)
02863 {
02864     rb_str_shared_replace(str, rb_str_succ(str));
02865 
02866     return str;
02867 }
02868 
02869 
02870 /*
02871  *  call-seq:
02872  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
02873  *     str.upto(other_str, exclusive=false)                -> an_enumerator
02874  *
02875  *  Iterates through successive values, starting at <i>str</i> and
02876  *  ending at <i>other_str</i> inclusive, passing each value in turn to
02877  *  the block. The <code>String#succ</code> method is used to generate
02878  *  each value.  If optional second argument exclusive is omitted or is false,
02879  *  the last value will be included; otherwise it will be excluded.
02880  *
02881  *  If no block is given, an enumerator is returned instead.
02882  *
02883  *     "a8".upto("b6") {|s| print s, ' ' }
02884  *     for s in "a8".."b6"
02885  *       print s, ' '
02886  *     end
02887  *
02888  *  <em>produces:</em>
02889  *
02890  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
02891  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
02892  *
 *  If <i>str</i> and <i>other_str</i> contain only ascii numeric characters,
 *  both are recognized as decimal numbers. In addition, the width of the
 *  strings (e.g. leading zeros) is handled appropriately.
02896  *
02897  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
02898  *     "25".upto("5").to_a   #=> []
02899  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
02900  */
02901 
02902 static VALUE
02903 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02904 {
02905     VALUE end, exclusive;
02906     VALUE current, after_end;
02907     ID succ;
02908     int n, excl, ascii;
02909     rb_encoding *enc;
02910 
02911     rb_scan_args(argc, argv, "11", &end, &exclusive);
02912     RETURN_ENUMERATOR(beg, argc, argv);
02913     excl = RTEST(exclusive);
02914     CONST_ID(succ, "succ");
02915     StringValue(end);
02916     enc = rb_enc_check(beg, end);
02917     ascii = (is_ascii_string(beg) && is_ascii_string(end));
02918     /* single character */
02919     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02920         char c = RSTRING_PTR(beg)[0];
02921         char e = RSTRING_PTR(end)[0];
02922 
02923         if (c > e || (excl && c == e)) return beg;
02924         for (;;) {
02925             rb_yield(rb_enc_str_new(&c, 1, enc));
02926             if (!excl && c == e) break;
02927             c++;
02928             if (excl && c == e) break;
02929         }
02930         return beg;
02931     }
02932     /* both edges are all digits */
02933     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02934         char *s, *send;
02935         VALUE b, e;
02936         int width;
02937 
02938         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
02939         width = rb_long2int(send - s);
02940         while (s < send) {
02941             if (!ISDIGIT(*s)) goto no_digits;
02942             s++;
02943         }
02944         s = RSTRING_PTR(end); send = RSTRING_END(end);
02945         while (s < send) {
02946             if (!ISDIGIT(*s)) goto no_digits;
02947             s++;
02948         }
02949         b = rb_str_to_inum(beg, 10, FALSE);
02950         e = rb_str_to_inum(end, 10, FALSE);
02951         if (FIXNUM_P(b) && FIXNUM_P(e)) {
02952             long bi = FIX2LONG(b);
02953             long ei = FIX2LONG(e);
02954             rb_encoding *usascii = rb_usascii_encoding();
02955 
02956             while (bi <= ei) {
02957                 if (excl && bi == ei) break;
02958                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02959                 bi++;
02960             }
02961         }
02962         else {
02963             ID op = excl ? '<' : rb_intern("<=");
02964             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02965 
02966             args[0] = INT2FIX(width);
02967             while (rb_funcall(b, op, 1, e)) {
02968                 args[1] = b;
02969                 rb_yield(rb_str_format(numberof(args), args, fmt));
02970                 b = rb_funcall(b, succ, 0, 0);
02971             }
02972         }
02973         return beg;
02974     }
02975     /* normal case */
02976   no_digits:
02977     n = rb_str_cmp(beg, end);
02978     if (n > 0 || (excl && n == 0)) return beg;
02979 
02980     after_end = rb_funcall(end, succ, 0, 0);
02981     current = rb_str_dup(beg);
02982     while (!rb_str_equal(current, after_end)) {
02983         VALUE next = Qnil;
02984         if (excl || !rb_str_equal(current, end))
02985             next = rb_funcall(current, succ, 0, 0);
02986         rb_yield(current);
02987         if (NIL_P(next)) break;
02988         current = next;
02989         StringValue(current);
02990         if (excl && rb_str_equal(current, end)) break;
02991         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02992             break;
02993     }
02994 
02995     return beg;
02996 }
02997 
02998 static VALUE
02999 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03000 {
          /*
           * Searches str for re starting at offset 0.  On a match, returns the
           * group selected by backref (converted to a group number by
           * rb_reg_backref_number()); when the search fails, returns nil.
           */
03001     if (rb_reg_search(re, str, 0, 0) >= 0) {
03002         VALUE match = rb_backref_get();   /* presumably the match data left by the search above */
03003         int nth = rb_reg_backref_number(match, backref);
03004         return rb_reg_nth_match(nth, match);
03005     }
03006     return Qnil;
03007 }
03008 
03009 static VALUE
03010 rb_str_aref(VALUE str, VALUE indx)
03011 {
          /*
           * Single-argument element reference.  Dispatches on the type of indx:
           *   Fixnum - substring of length 1 at that index (nil if it is empty)
           *   Regexp - group 0 of the match, via rb_str_subpat()
           *   String - a copy of indx if rb_str_index() finds it in str, else nil
           *   other  - tried as a Range first, then converted with NUM2LONG()
           *            and handled like a Fixnum
           */
03012     long idx;
03013
03014     switch (TYPE(indx)) {
03015       case T_FIXNUM:
03016         idx = FIX2LONG(indx);
03017
03018       num_index:                /* also entered from the default case below */
03019         str = rb_str_substr(str, idx, 1);
03020         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;   /* an empty result is reported as nil */
03021         return str;
03022
03023       case T_REGEXP:
03024         return rb_str_subpat(str, indx, INT2FIX(0));
03025
03026       case T_STRING:
03027         if (rb_str_index(str, indx, 0) != -1)
03028             return rb_str_dup(indx);
03029         return Qnil;
03030
03031       default:
03032         /* check if indx is Range */
03033         {
03034             long beg, len;
03035             VALUE tmp;
03036
03037             len = str_strlen(str, STR_ENC_GET(str));   /* string length, passed as the bound for the range */
03038             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03039               case Qfalse:      /* leave the switch and fall back to numeric conversion */
03040                 break;
03041               case Qnil:
03042                 return Qnil;
03043               default:
03044                 tmp = rb_str_substr(str, beg, len);
03045                 return tmp;
03046             }
03047         }
03048         idx = NUM2LONG(indx);
03049         goto num_index;
03050     }
03051     return Qnil;                /* not reached */
03052 }
03053 
03054 
03055 /*
03056  *  call-seq:
03057  *     str[fixnum]                 -> new_str or nil
03058  *     str[fixnum, fixnum]         -> new_str or nil
03059  *     str[range]                  -> new_str or nil
03060  *     str[regexp]                 -> new_str or nil
03061  *     str[regexp, fixnum]         -> new_str or nil
03062  *     str[other_str]              -> new_str or nil
03063  *     str.slice(fixnum)           -> new_str or nil
03064  *     str.slice(fixnum, fixnum)   -> new_str or nil
03065  *     str.slice(range)            -> new_str or nil
03066  *     str.slice(regexp)           -> new_str or nil
03067  *     str.slice(regexp, fixnum)   -> new_str or nil
03068  *     str.slice(regexp, capname)  -> new_str or nil
03069  *     str.slice(other_str)        -> new_str or nil
03070  *
03071  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
03072  *  substring of one character at that position. If passed two <code>Fixnum</code>
03073  *  objects, returns a substring starting at the offset given by the first, and
03074  *  a length given by the second. If given a range, a substring containing
03075  *  characters at offsets given by the range is returned. In all three cases, if
03076  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
03077  *  <code>nil</code> if the initial offset falls outside the string, the length
03078  *  is negative, or the beginning of the range is greater than the end.
03079  *
03080  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
03081  *  returned. If a numeric or name parameter follows the regular expression, that
03082  *  component of the <code>MatchData</code> is returned instead. If a
03083  *  <code>String</code> is given, that string is returned if it occurs in
03084  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
03085  *  match.
03086  *
03087  *     a = "hello there"
03088  *     a[1]                   #=> "e"
03089  *     a[1,3]                 #=> "ell"
03090  *     a[1..3]                #=> "ell"
03091  *     a[-3,2]                #=> "er"
03092  *     a[-4..-2]              #=> "her"
03093  *     a[12..-1]              #=> nil
03094  *     a[-2..-4]              #=> ""
03095  *     a[/[aeiou](.)\1/]      #=> "ell"
03096  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03097  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03098  *     a[/[aeiou](.)\1/, 2]   #=> nil
03099  *     a["lo"]                #=> "lo"
03100  *     a["bye"]               #=> nil
03101  */
03102 
03103 static VALUE
03104 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03105 {
03106     if (argc == 2) {
03107         if (TYPE(argv[0]) == T_REGEXP) {
03108             return rb_str_subpat(str, argv[0], argv[1]);   /* str[regexp, backref] */
03109         }
03110         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));   /* str[start, length] */
03111     }
03112     if (argc != 1) {            /* anything other than one or two arguments is an error */
03113         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03114     }
03115     return rb_str_aref(str, argv[0]);   /* single-argument forms */
03116 }
03117 
03118 VALUE
03119 rb_str_drop_bytes(VALUE str, long len)
03120 {
          /*
           * Removes the first len bytes of str in place and returns str.
           * len is clamped to the current byte length.  The coderange of str
           * is cleared before returning.
           */
03121     char *ptr = RSTRING_PTR(str);
03122     long olen = RSTRING_LEN(str), nlen;
03123
03124     str_modifiable(str);
03125     if (len > olen) len = olen;
03126     nlen = olen - len;          /* number of bytes that remain */
03127     if (nlen <= RSTRING_EMBED_LEN_MAX) {
              /* the remainder fits in the embedded buffer: copy it there */
03128         char *oldptr = ptr;
03129         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));   /* flags as they were before STR_SET_EMBED */
03130         STR_SET_EMBED(str);
03131         STR_SET_EMBED_LEN(str, nlen);
03132         ptr = RSTRING(str)->as.ary;
03133         memmove(ptr, oldptr + len, nlen);
03134         if (fl == STR_NOEMBED) xfree(oldptr);   /* free only a heap buffer that was not flagged ELTS_SHARED */
03135     }
03136     else {
              /* NOTE(review): the result of rb_str_new4() is discarded; presumably it is
               * called so that str shares its buffer before the pointer is advanced -- confirm */
03137         if (!STR_SHARED_P(str)) rb_str_new4(str);
03138         ptr = RSTRING(str)->as.heap.ptr += len;   /* skip the dropped bytes without copying */
03139         RSTRING(str)->as.heap.len = nlen;
03140     }
03141     ptr[nlen] = 0;              /* keep the buffer NUL-terminated */
03142     ENC_CODERANGE_CLEAR(str);
03143     return str;
03144 }
03145 
03146 static void
03147 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03148 {
          /*
           * Byte-level splice: replaces the len bytes of str that start at byte
           * offset beg with the bytes of val, adjusts the length, keeps the
           * buffer NUL-terminated and finally calls OBJ_INFECT(str, val).
           * No bounds or encoding checks are performed here.
           */
03149     if (beg == 0 && RSTRING_LEN(val) == 0) {
              /* pure deletion at the front: delegate to rb_str_drop_bytes() */
03150         rb_str_drop_bytes(str, len);
03151         OBJ_INFECT(str, val);
03152         return;
03153     }
03154
03155     rb_str_modify(str);
03156     if (len < RSTRING_LEN(val)) {
03157         /* expand string */
03158         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03159     }
03160
03161     if (RSTRING_LEN(val) != len) {
              /* sizes differ: move the tail that follows the replaced region */
03162         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03163                 RSTRING_PTR(str) + beg + len,
03164                 RSTRING_LEN(str) - (beg + len));
03165     }
03166     if (RSTRING_LEN(val) < beg && len < 0) {
              /* only taken for a negative len: zero-fill -len bytes from the current end */
03167         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03168     }
03169     if (RSTRING_LEN(val) > 0) {
03170         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03171     }
03172     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03173     if (RSTRING_PTR(str)) {
03174         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03175     }
03176     OBJ_INFECT(str, val);
03177 }
03178 
03179 static void
03180 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03181 {
          /*
           * Character-level splice: replaces len characters of str starting at
           * character index beg with val.
           *
           * Raises IndexError when len is negative or when beg lies outside the
           * string.  A negative beg counts from the end of the string; len is
           * clamped so that the region does not extend past the end.  The
           * character offsets are converted to byte offsets with str_nth() and
           * the actual work is done by rb_str_splice_0().
           */
03182     long slen;
03183     char *p, *e;
03184     rb_encoding *enc;
03185     int singlebyte = single_byte_optimizable(str);
03186     int cr;
03187
03188     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03189
03190     StringValue(val);
03191     enc = rb_enc_check(str, val);
03192     slen = str_strlen(str, enc);    /* length of str in characters */
03193
03194     if (slen < beg) {
03195       out_of_range:
03196         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03197     }
03198     if (beg < 0) {
03199         if (-beg > slen) {
03200             goto out_of_range;
03201         }
03202         beg += slen;                /* negative index counts from the end */
03203     }
03204     if (slen < len || slen < beg + len) {
03205         len = slen - beg;           /* clamp to the end of the string */
03206     }
03207     str_modify_keep_cr(str);
03208     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03209     if (!p) p = RSTRING_END(str);   /* str_nth() gave no position: use the end of the string */
03210     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03211     if (!e) e = RSTRING_END(str);
03212     /* error check */
03213     beg = p - RSTRING_PTR(str); /* physical position */
03214     len = e - p;                /* physical length */
03215     rb_str_splice_0(str, beg, len, val);
03216     rb_enc_associate(str, enc);
03217     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03218     if (cr != ENC_CODERANGE_BROKEN)     /* a BROKEN combination is not stored */
03219         ENC_CODERANGE_SET(str, cr);
03220 }
03221 
03222 void
03223 rb_str_update(VALUE str, long beg, long len, VALUE val)
03224 {
03225     rb_str_splice(str, beg, len, val);  /* non-static entry point; forwards all arguments unchanged */
03226 }
03227 
03228 static void
03229 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03230 {
          /*
           * Replaces, in str, the text matched by group backref of the first
           * match of re with val.
           *
           * Raises IndexError when re does not match, when the group number is
           * outside the match registers, or when the selected group did not
           * take part in the match (its start offset is -1).
           */
03231     int nth;
03232     VALUE match;
03233     long start, end, len;
03234     rb_encoding *enc;
03235     struct re_registers *regs;
03236
03237     if (rb_reg_search(re, str, 0, 0) < 0) {
03238         rb_raise(rb_eIndexError, "regexp not matched");
03239     }
03240     match = rb_backref_get();
03241     nth = rb_reg_backref_number(match, backref);
03242     regs = RMATCH_REGS(match);
03243     if (nth >= regs->num_regs) {
03244       out_of_range:
03245         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03246     }
03247     if (nth < 0) {
03248         if (-nth >= regs->num_regs) {
03249             goto out_of_range;
03250         }
03251         nth += regs->num_regs;      /* negative group numbers count from the last register */
03252     }
03253
03254     start = BEG(nth);               /* BEG/END read regs->beg[] / regs->end[] */
03255     if (start == -1) {
03256         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03257     }
03258     end = END(nth);
03259     len = end - start;
03260     StringValue(val);
03261     enc = rb_enc_check(str, val);
03262     rb_str_splice_0(str, start, len, val);  /* register offsets are used directly as byte offsets */
03263     rb_enc_associate(str, enc);
03264 }
03265 
03266 static VALUE
03267 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03268 {
          /*
           * Two-argument element assignment (str[indx] = val).  Dispatches on
           * the type of indx in the same way as rb_str_aref() and always
           * returns val.  Raises IndexError when a String index is not found.
           */
03269     long idx, beg;
03270
03271     switch (TYPE(indx)) {
03272       case T_FIXNUM:
03273         idx = FIX2LONG(indx);
03274       num_index:                /* also entered from the default case below */
03275         rb_str_splice(str, idx, 1, val);    /* replace a single character */
03276         return val;
03277
03278       case T_REGEXP:
03279         rb_str_subpat_set(str, indx, INT2FIX(0), val);  /* replace group 0 */
03280         return val;
03281
03282       case T_STRING:
03283         beg = rb_str_index(str, indx, 0);
03284         if (beg < 0) {
03285             rb_raise(rb_eIndexError, "string not matched");
03286         }
03287         beg = rb_str_sublen(str, beg);      /* presumably converts the position to a character index -- confirm */
03288         rb_str_splice(str, beg, str_strlen(indx, 0), val);  /* replace as many characters as indx has */
03289         return val;
03290
03291       default:
03292         /* check if indx is Range */
03293         {
03294             long beg, len;                  /* this beg shadows the outer one */
03295             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03296                 rb_str_splice(str, beg, len, val);
03297                 return val;
03298             }
03299         }
03300         idx = NUM2LONG(indx);               /* not a Range: treat as a numeric index */
03301         goto num_index;
03302     }
03303 }
03304 
03305 /*
03306  *  call-seq:
03307  *     str[fixnum] = new_str
03308  *     str[fixnum, fixnum] = new_str
03309  *     str[range] = aString
03310  *     str[regexp] = new_str
03311  *     str[regexp, fixnum] = new_str
03312  *     str[regexp, name] = new_str
03313  *     str[other_str] = new_str
03314  *
03315  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03316  *  portion of the string affected is determined using the same criteria as
03317  *  <code>String#[]</code>. If the replacement string is not the same length as
03318  *  the text it is replacing, the string will be adjusted accordingly. If the
03319  *  regular expression or string used as the index doesn't match a position
03320  *  in the string, <code>IndexError</code> is raised. If the regular expression
03321  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03322  *  which portion of the match to replace (effectively using the
03323  *  <code>MatchData</code> indexing rules). The forms that take a
03324  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03325  *  out of range; the <code>Range</code> form will raise a
03326  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03327  *  forms will raise an <code>IndexError</code> when there is no match.
03328  */
03329 
03330 static VALUE
03331 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03332 {
03333     if (argc == 3) {
03334         if (TYPE(argv[0]) == T_REGEXP) {
03335             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);  /* str[regexp, backref] = val */
03336         }
03337         else {
03338             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);  /* str[start, length] = val */
03339         }
03340         return argv[2];         /* the assigned value is the result */
03341     }
03342     if (argc != 2) {
03343         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03344     }
03345     return rb_str_aset(str, argv[0], argv[1]);  /* single-index forms */
03346 }
03347 
03348 /*
03349  *  call-seq:
03350  *     str.insert(index, other_str)   -> str
03351  *
03352  *  Inserts <i>other_str</i> before the character at the given
03353  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03354  *  end of the string, and insert <em>after</em> the given character.
03355  *  The intent is to insert <i>other_str</i> so that it starts at the given
03356  *  <i>index</i>.
03357  *
03358  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03359  *     "abcd".insert(3, 'X')    #=> "abcXd"
03360  *     "abcd".insert(4, 'X')    #=> "abcdX"
03361  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03362  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03363  */
03364 
03365 static VALUE
03366 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03367 {
03368     long pos = NUM2LONG(idx);
03369
03370     if (pos == -1) {
03371         return rb_str_append(str, str2);    /* -1 means "after the last character": plain append */
03372     }
03373     else if (pos < 0) {
03374         pos++;                  /* shift by one so the insertion lands after the indexed character */
03375     }
03376     rb_str_splice(str, pos, 0, str2);       /* zero-length splice: nothing is removed */
03377     return str;
03378 }
03379 
03380 
03381 /*
03382  *  call-seq:
03383  *     str.slice!(fixnum)           -> new_str or nil
03384  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03385  *     str.slice!(range)            -> new_str or nil
03386  *     str.slice!(regexp)           -> new_str or nil
03387  *     str.slice!(other_str)        -> new_str or nil
03388  *
03389  *  Deletes the specified portion from <i>str</i>, and returns the portion
03390  *  deleted.
03391  *
03392  *     string = "this is a string"
03393  *     string.slice!(2)        #=> "i"
03394  *     string.slice!(3..6)     #=> " is "
03395  *     string.slice!(/s.*t/)   #=> "sa st"
03396  *     string.slice!("r")      #=> "r"
03397  *     string                  #=> "thing"
03398  */
03399 
03400 static VALUE
03401 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03402 {
03403     VALUE result;
03404     VALUE buf[3];               /* at most two index arguments plus the replacement value */
03405     int i;
03406
03407     if (argc < 1 || 2 < argc) {
03408         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03409     }
03410     for (i=0; i<argc; i++) {
03411         buf[i] = argv[i];
03412     }
03413     str_modify_keep_cr(str);
03414     buf[i] = rb_str_new(0,0);   /* empty string used as the value assigned below */
03415     result = rb_str_aref_m(argc, buf, str);     /* fetch the portion first ... */
03416     if (!NIL_P(result)) {
03417         rb_str_aset_m(argc+1, buf, str);        /* ... then overwrite it with the empty string */
03418     }
03419     return result;
03420 }
03421 
03422 static VALUE
03423 get_pat(VALUE pat, int quote)
03424 {
          /*
           * Normalizes a pattern argument.  A Regexp is returned unchanged.
           * A String (or an object that rb_check_string_type() can convert) is
           * passed through rb_reg_quote() when quote is non-zero and then
           * compiled with rb_reg_regcomp().
           */
03425     VALUE val;
03426
03427     switch (TYPE(pat)) {
03428       case T_REGEXP:
03429         return pat;
03430
03431       case T_STRING:
03432         break;
03433
03434       default:
03435         val = rb_check_string_type(pat);
03436         if (NIL_P(val)) {
                  /* not string-convertible; presumably Check_Type raises here since pat is not a Regexp -- confirm */
03437             Check_Type(pat, T_REGEXP);
03438         }
03439         pat = val;
03440     }
03441
03442     if (quote) {
03443         pat = rb_reg_quote(pat);
03444     }
03445
03446     return rb_reg_regcomp(pat);
03447 }
03448 
03449 
03450 /*
03451  *  call-seq:
03452  *     str.sub!(pattern, replacement)          -> str or nil
03453  *     str.sub!(pattern) {|match| block }      -> str or nil
03454  *
03455  *  Performs the substitutions of <code>String#sub</code> in place,
03456  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
03457  *  performed.
03458  */
03459 
03460 static VALUE
03461 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03462 {
03463     VALUE pat, repl, hash = Qnil;
03464     int iter = 0;
03465     int tainted = 0;
03466     int untrusted = 0;
03467     long plen;
03468
03469     if (argc == 1 && rb_block_given_p()) {
03470         iter = 1;               /* block form: the replacement comes from the block */
03471     }
03472     else if (argc == 2) {
03473         repl = argv[1];
03474         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03475         if (NIL_P(hash)) {
03476             StringValue(repl);  /* not a Hash: the replacement must be a String */
03477         }
03478         if (OBJ_TAINTED(repl)) tainted = 1;
03479         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03480     }
03481     else {
03482         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03483     }
03484
03485     pat = get_pat(argv[0], 1);  /* quote=1: a String pattern goes through rb_reg_quote() */
03486     str_modifiable(str);
03487     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03488         rb_encoding *enc;
03489         int cr = ENC_CODERANGE(str);
03490         VALUE match = rb_backref_get();
03491         struct re_registers *regs = RMATCH_REGS(match);
03492         long beg0 = BEG(0);     /* offsets of the whole match (group 0) */
03493         long end0 = END(0);
03494         char *p, *rp;
03495         long len, rlen;
03496
03497         if (iter || !NIL_P(hash)) {
03498             p = RSTRING_PTR(str); len = RSTRING_LEN(str);   /* remembered for str_mod_check() below */
03499
03500             if (iter) {
03501                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03502             }
03503             else {
03504                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03505                 repl = rb_obj_as_string(repl);
03506             }
                  /* the block or hash lookup may have run other code: re-check str
                   * (presumably these raise if str was changed or frozen -- confirm) */
03507             str_mod_check(str, p, len);
03508             str_frozen_check(str);
03509         }
03510         else {
03511             repl = rb_reg_regsub(repl, str, regs, pat);
03512         }
03513         enc = rb_enc_compatible(str, repl);
03514         if (!enc) {
                  /* no compatible encoding: allowed only if the text before and after the match scans as 7-bit */
03515             rb_encoding *str_enc = STR_ENC_GET(str);
03516             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03517             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03518                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03519                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03520                          rb_enc_name(str_enc),
03521                          rb_enc_name(STR_ENC_GET(repl)));
03522             }
03523             enc = STR_ENC_GET(repl);        /* the result takes the replacement's encoding */
03524         }
03525         rb_str_modify(str);
03526         rb_enc_associate(str, enc);
03527         if (OBJ_TAINTED(repl)) tainted = 1;
03528         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03529         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
                  /* cr lies strictly between UNKNOWN and BROKEN: combine it with the replacement's coderange */
03530             int cr2 = ENC_CODERANGE(repl);
03531             if (cr2 == ENC_CODERANGE_BROKEN ||
03532                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03533                 cr = ENC_CODERANGE_UNKNOWN;
03534             else
03535                 cr = cr2;
03536         }
03537         plen = end0 - beg0;     /* length of the matched text */
03538         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03539         len = RSTRING_LEN(str);
03540         if (rlen > plen) {
03541             RESIZE_CAPA(str, len + rlen - plen);    /* grow before writing a longer replacement */
03542         }
03543         p = RSTRING_PTR(str);   /* re-read the pointer after the possible resize */
03544         if (rlen != plen) {
03545             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);   /* shift the text after the match */
03546         }
03547         memcpy(p + beg0, rp, rlen);
03548         len += rlen - plen;
03549         STR_SET_LEN(str, len);
03550         RSTRING_PTR(str)[len] = '\0';
03551         ENC_CODERANGE_SET(str, cr);
03552         if (tainted) OBJ_TAINT(str);
03553         if (untrusted) OBJ_UNTRUST(str);
03554
03555         return str;
03556     }
03557     return Qnil;                /* no match: nothing was substituted */
03558 }
03559 
03560 
03561 /*
03562  *  call-seq:
03563  *     str.sub(pattern, replacement)         -> new_str
03564  *     str.sub(pattern, hash)                -> new_str
03565  *     str.sub(pattern) {|match| block }     -> new_str
03566  *
03567  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
03568  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03569  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03570  *  regular expression metacharacters it contains will be interpreted
03571  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03572  *  instead of a digit.
03573  *
03574  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03575  *  the matched text. It may contain back-references to the pattern's capture
03576  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03577  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03578  *  double-quoted string, both back-references must be preceded by an
03579  *  additional backslash. However, within <i>replacement</i> the special match
03580  *  variables, such as <code>$&</code>, will not refer to the current match.
03581  *
03582  *  If the second argument is a <code>Hash</code>, and the matched text is one
03583  *  of its keys, the corresponding value is the replacement string.
03584  *
03585  *  In the block form, the current match string is passed in as a parameter,
03586  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03587  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03588  *  returned by the block will be substituted for the match on each call.
03589  *
03590  *  The result inherits any tainting in the original string or any supplied
03591  *  replacement string.
03592  *
03593  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03594  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03595  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03596  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03597  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03598  *      #=> "Is /bin/bash your preferred shell?"
03599  */
03600 
03601 static VALUE
03602 rb_str_sub(int argc, VALUE *argv, VALUE str)
03603 {
03604     str = rb_str_dup(str);              /* work on a copy; the receiver is not modified */
03605     rb_str_sub_bang(argc, argv, str);   /* return value ignored: the copy is returned even when nothing matched */
03606     return str;
03607 }
03608 
03609 static VALUE
03610 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03611 {
          /*
           * Shared implementation behind rb_str_gsub() (bang == 0) and
           * rb_str_gsub_bang() (bang == 1).  Builds the result in a separate
           * buffer (dest) by alternately copying unmatched text and
           * replacement text.
           *
           * With bang set, the contents of str are replaced by dest and str is
           * returned, or nil when the pattern never matches.  Without bang,
           * dest is returned, or a copy of str when the pattern never matches.
           */
03612     VALUE pat, val, repl, match, dest, hash = Qnil;
03613     struct re_registers *regs;
03614     long beg, n;
03615     long beg0, end0;
03616     long offset, blen, slen, len, last;
03617     int iter = 0;
03618     char *sp, *cp;
03619     int tainted = 0;
03620     rb_encoding *str_enc;
03621
03622     switch (argc) {
03623       case 1:
03624         RETURN_ENUMERATOR(str, argc, argv);     /* presumably returns an enumerator when no block is given */
03625         iter = 1;
03626         break;
03627       case 2:
03628         repl = argv[1];
03629         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03630         if (NIL_P(hash)) {
03631             StringValue(repl);      /* not a Hash: the replacement must be a String */
03632         }
03633         if (OBJ_TAINTED(repl)) tainted = 1;
03634         break;
03635       default:
03636         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03637     }
03638
03639     pat = get_pat(argv[0], 1);      /* quote=1: a String pattern goes through rb_reg_quote() */
03640     beg = rb_reg_search(pat, str, 0, 0);
03641     if (beg < 0) {
03642         if (bang) return Qnil;  /* no match, no substitution */
03643         return rb_str_dup(str);
03644     }
03645
03646     offset = 0;
03647     n = 0;
03648     blen = RSTRING_LEN(str) + 30; /* len + margin */
03649     dest = rb_str_buf_new(blen);
03650     sp = RSTRING_PTR(str);          /* sp/slen are kept for str_mod_check() inside the loop */
03651     slen = RSTRING_LEN(str);
03652     cp = sp;                        /* cp: start of the text not yet copied to dest */
03653     str_enc = STR_ENC_GET(str);
03654
03655     do {
03656         n++;                        /* n is counted but not read anywhere else in this function */
03657         match = rb_backref_get();
03658         regs = RMATCH_REGS(match);
03659         beg0 = BEG(0);
03660         end0 = END(0);
03661         if (iter || !NIL_P(hash)) {
03662             if (iter) {
03663                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03664             }
03665             else {
03666                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03667                 val = rb_obj_as_string(val);
03668             }
03669             str_mod_check(str, sp, slen);   /* presumably raises if str was changed meanwhile -- confirm */
03670             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03671                 rb_raise(rb_eRuntimeError, "block should not cheat");
03672             }
03673         }
03674         else {
03675             val = rb_reg_regsub(repl, str, regs, pat);
03676         }
03677
03678         if (OBJ_TAINTED(val)) tainted = 1;
03679
03680         len = beg - offset;     /* copy pre-match substr */
03681         if (len) {
03682             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03683         }
03684
03685         rb_str_buf_append(dest, val);
03686
03687         last = offset;          /* offset this match was searched from; reused after the loop */
03688         offset = end0;
03689         if (beg0 == end0) {
03690             /*
03691              * Always consume at least one character of the input string
03692              * in order to prevent infinite loops.
03693              */
03694             if (RSTRING_LEN(str) <= end0) break;
03695             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03696             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03697             offset = end0 + len;
03698         }
03699         cp = RSTRING_PTR(str) + offset;
03700         if (offset > RSTRING_LEN(str)) break;
03701         beg = rb_reg_search(pat, str, offset, 0);
03702     } while (beg >= 0);
03703     if (RSTRING_LEN(str) > offset) {
03704         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);   /* copy the text after the last match */
03705     }
          /* NOTE(review): result unused; presumably re-run so the backref reflects the last successful match -- confirm */
03706     rb_reg_search(pat, str, last, 0);
03707     if (bang) {
03708         rb_str_shared_replace(str, dest);
03709     }
03710     else {
03711         RBASIC(dest)->klass = rb_obj_class(str);    /* the result gets the class of str */
03712         OBJ_INFECT(dest, str);
03713         str = dest;
03714     }
03715
03716     if (tainted) OBJ_TAINT(str);
03717     return str;
03718 }
03719 
03720 
03721 /*
03722  *  call-seq:
03723  *     str.gsub!(pattern, replacement)        -> str or nil
03724  *     str.gsub!(pattern) {|match| block }    -> str or nil
03725  *     str.gsub!(pattern)                     -> an_enumerator
03726  *
03727  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03728  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03729  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03730  */
03731 
03732 static VALUE
03733 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03734 {
03735     str_modify_keep_cr(str);                /* called before any pattern matching is done */
03736     return str_gsub(argc, argv, str, 1);    /* bang=1: replace the contents of str in place */
03737 }
03738 
03739 
03740 /*
03741  *  call-seq:
03742  *     str.gsub(pattern, replacement)       -> new_str
03743  *     str.gsub(pattern, hash)              -> new_str
03744  *     str.gsub(pattern) {|match| block }   -> new_str
03745  *     str.gsub(pattern)                    -> enumerator
03746  *
03747  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03748  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03749  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03750  *  regular expression metacharacters it contains will be interpreted
03751  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03752  *  instead of a digit.
03753  *
03754  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03755  *  the matched text. It may contain back-references to the pattern's capture
03756  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03757  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03758  *  double-quoted string, both back-references must be preceded by an
03759  *  additional backslash. However, within <i>replacement</i> the special match
03760  *  variables, such as <code>$&</code>, will not refer to the current match.
03761  *
03762  *  If the second argument is a <code>Hash</code>, and the matched text is one
03763  *  of its keys, the corresponding value is the replacement string.
03764  *
03765  *  In the block form, the current match string is passed in as a parameter,
03766  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03767  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03768  *  returned by the block will be substituted for the match on each call.
03769  *
03770  *  The result inherits any tainting in the original string or any supplied
03771  *  replacement string.
03772  *
03773  *  When neither a block nor a second argument is supplied, an
03774  *  <code>Enumerator</code> is returned.
03775  *
03776  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03777  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03778  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03779  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
03780  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
03781  */
03782 
03783 static VALUE
03784 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03785 {
03786     return str_gsub(argc, argv, str, 0);    /* bang=0: str_gsub() returns a separate string */
03787 }
03788 
03789 
03790 /*
03791  *  call-seq:
03792  *     str.replace(other_str)   -> str
03793  *
03794  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
03795  *  values in <i>other_str</i>.
03796  *
03797  *     s = "hello"         #=> "hello"
03798  *     s.replace "world"   #=> "world"
03799  */
03800 
03801 VALUE
03802 rb_str_replace(VALUE str, VALUE str2)
03803 {
03804     str_modifiable(str);            /* checked first, even when str2 is str itself */
03805     if (str == str2) return str;    /* replacing a string with itself is a no-op */
03806
03807     StringValue(str2);
03808     str_discard(str);               /* presumably releases the current contents of str -- confirm */
03809     return str_replace(str, str2);
03810 }
03811 
03812 /*
03813  *  call-seq:
03814  *     string.clear    ->  string
03815  *
03816  *  Makes string empty.
03817  *
03818  *     a = "abcde"
03819  *     a.clear    #=> ""
03820  */
03821 
03822 static VALUE
03823 rb_str_clear(VALUE str)
03824 {
03825     str_discard(str);
03826     STR_SET_EMBED(str);             /* switch to the embedded representation with length 0 */
03827     STR_SET_EMBED_LEN(str, 0);
03828     RSTRING_PTR(str)[0] = 0;        /* NUL-terminate the now empty buffer */
          /* the empty string is marked 7BIT for ASCII-compatible encodings, VALID otherwise */
03829     if (rb_enc_asciicompat(STR_ENC_GET(str)))
03830         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03831     else
03832         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03833     return str;
03834 }
03835 
03836 /*
03837  *  call-seq:
03838  *     string.chr    ->  string
03839  *
03840  *  Returns a one-character string at the beginning of the string.
03841  *
03842  *     a = "abcde"
03843  *     a.chr    #=> "a"
03844  */
03845 
03846 static VALUE
03847 rb_str_chr(VALUE str)
03848 {
03849     return rb_str_substr(str, 0, 1);    /* substring of length 1 starting at index 0 */
03850 }
03851 
03852 /*
03853  *  call-seq:
03854  *     str.getbyte(index)          -> 0 .. 255 or nil
03855  *
03856  *  Returns the <i>index</i>th byte as an integer, or <code>nil</code> if <i>index</i> is out of range.
03857  */
03858 static VALUE
03859 rb_str_getbyte(VALUE str, VALUE index)
03860 {
03861     long pos = NUM2LONG(index);
03862
03863     if (pos < 0)
03864         pos += RSTRING_LEN(str);    /* a negative index counts from the end */
03865     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
03866         return Qnil;                /* out of range in either direction */
03867
03868     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);   /* cast keeps the result non-negative */
03869 }
03870 
03871 /*
03872  *  call-seq:
03873  *     str.setbyte(index, int) -> int
03874  *
03875  *  modifies the <i>index</i>th byte as <i>int</i>.
03876  */
03877 static VALUE
03878 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03879 {
03880     long pos = NUM2LONG(index);
03881     int byte = NUM2INT(value);
03882 
03883     rb_str_modify(str);
03884 
03885     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03886         rb_raise(rb_eIndexError, "index %ld out of string", pos);
03887     if (pos < 0)
03888         pos += RSTRING_LEN(str);
03889 
03890     RSTRING_PTR(str)[pos] = byte;
03891 
03892     return value;
03893 }
03894 
03895 /*
03896  *  call-seq:
03897  *     str.reverse   -> new_str
03898  *
03899  *  Returns a new string with the characters from <i>str</i> in reverse order.
03900  *
03901  *     "stressed".reverse   #=> "desserts"
03902  */
03903 
03904 static VALUE
03905 rb_str_reverse(VALUE str)
03906 {
03907     rb_encoding *enc;
03908     VALUE rev;
03909     char *s, *e, *p;
03910     int single = 1;
03911 
03912     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03913     enc = STR_ENC_GET(str);
03914     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03915     s = RSTRING_PTR(str); e = RSTRING_END(str);
03916     p = RSTRING_END(rev);
03917 
03918     if (RSTRING_LEN(str) > 1) {
03919         if (single_byte_optimizable(str)) {
03920             while (s < e) {
03921                 *--p = *s++;
03922             }
03923         }
03924         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03925             while (s < e) {
03926                 int clen = rb_enc_fast_mbclen(s, e, enc);
03927 
03928                 if (clen > 1 || (*s & 0x80)) single = 0;
03929                 p -= clen;
03930                 memcpy(p, s, clen);
03931                 s += clen;
03932             }
03933         }
03934         else {
03935             while (s < e) {
03936                 int clen = rb_enc_mbclen(s, e, enc);
03937 
03938                 if (clen > 1 || (*s & 0x80)) single = 0;
03939                 p -= clen;
03940                 memcpy(p, s, clen);
03941                 s += clen;
03942             }
03943         }
03944     }
03945     STR_SET_LEN(rev, RSTRING_LEN(str));
03946     OBJ_INFECT(rev, str);
03947     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03948         if (single) {
03949             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03950         }
03951         else {
03952             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03953         }
03954     }
03955     rb_enc_cr_str_copy_for_substr(rev, str);
03956 
03957     return rev;
03958 }
03959 
03960 
03961 /*
03962  *  call-seq:
03963  *     str.reverse!   -> str
03964  *
03965  *  Reverses <i>str</i> in place.
03966  */
03967 
03968 static VALUE
03969 rb_str_reverse_bang(VALUE str)
03970 {
03971     if (RSTRING_LEN(str) > 1) {
03972         if (single_byte_optimizable(str)) {
03973             char *s, *e, c;
03974 
03975             str_modify_keep_cr(str);
03976             s = RSTRING_PTR(str);
03977             e = RSTRING_END(str) - 1;
03978             while (s < e) {
03979                 c = *s;
03980                 *s++ = *e;
03981                 *e-- = c;
03982             }
03983         }
03984         else {
03985             rb_str_shared_replace(str, rb_str_reverse(str));
03986         }
03987     }
03988     else {
03989         str_modify_keep_cr(str);
03990     }
03991     return str;
03992 }
03993 
03994 
03995 /*
03996  *  call-seq:
03997  *     str.include? other_str   -> true or false
03998  *
03999  *  Returns <code>true</code> if <i>str</i> contains the given string or
04000  *  character.
04001  *
04002  *     "hello".include? "lo"   #=> true
04003  *     "hello".include? "ol"   #=> false
04004  *     "hello".include? ?h     #=> true
04005  */
04006 
04007 static VALUE
04008 rb_str_include(VALUE str, VALUE arg)
04009 {
04010     long i;
04011 
04012     StringValue(arg);
04013     i = rb_str_index(str, arg, 0);
04014 
04015     if (i == -1) return Qfalse;
04016     return Qtrue;
04017 }
04018 
04019 
04020 /*
04021  *  call-seq:
04022  *     str.to_i(base=10)   -> integer
04023  *
04024  *  Returns the result of interpreting leading characters in <i>str</i> as an
04025  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04026  *  end of a valid number are ignored. If there is not a valid number at the
04027  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04028  *  exception when <i>base</i> is valid.
04029  *
04030  *     "12345".to_i             #=> 12345
04031  *     "99 red balloons".to_i   #=> 99
04032  *     "0a".to_i                #=> 0
04033  *     "0a".to_i(16)            #=> 10
04034  *     "hello".to_i             #=> 0
04035  *     "1100101".to_i(2)        #=> 101
04036  *     "1100101".to_i(8)        #=> 294977
04037  *     "1100101".to_i(10)       #=> 1100101
04038  *     "1100101".to_i(16)       #=> 17826049
04039  */
04040 
04041 static VALUE
04042 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04043 {
04044     int base;
04045 
04046     if (argc == 0) base = 10;
04047     else {
04048         VALUE b;
04049 
04050         rb_scan_args(argc, argv, "01", &b);
04051         base = NUM2INT(b);
04052     }
04053     if (base < 0) {
04054         rb_raise(rb_eArgError, "invalid radix %d", base);
04055     }
04056     return rb_str_to_inum(str, base, FALSE);
04057 }
04058 
04059 
04060 /*
04061  *  call-seq:
04062  *     str.to_f   -> float
04063  *
04064  *  Returns the result of interpreting leading characters in <i>str</i> as a
04065  *  floating point number. Extraneous characters past the end of a valid number
04066  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04067  *  <code>0.0</code> is returned. This method never raises an exception.
04068  *
04069  *     "123.45e1".to_f        #=> 1234.5
04070  *     "45.67 degrees".to_f   #=> 45.67
04071  *     "thx1138".to_f         #=> 0.0
04072  */
04073 
04074 static VALUE
04075 rb_str_to_f(VALUE str)
04076 {
04077     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04078 }
04079 
04080 
04081 /*
04082  *  call-seq:
04083  *     str.to_s     -> str
04084  *     str.to_str   -> str
04085  *
04086  *  Returns the receiver.
04087  */
04088 
04089 static VALUE
04090 rb_str_to_s(VALUE str)
04091 {
04092     if (rb_obj_class(str) != rb_cString) {
04093         return str_duplicate(rb_cString, str);
04094     }
04095     return str;
04096 }
04097 
#if 0
/* Compiled out: encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
04109 
04110 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04111 
04112 int
04113 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04114 {
04115     char buf[CHAR_ESC_LEN + 1];
04116     int l;
04117 
04118 #if SIZEOF_INT > 4
04119     c &= 0xffffffff;
04120 #endif
04121     if (unicode_p) {
04122         if (c < 0x7F && ISPRINT(c)) {
04123             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04124         }
04125         else if (c < 0x10000) {
04126             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04127         }
04128         else {
04129             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04130         }
04131     }
04132     else {
04133         if (c < 0x100) {
04134             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04135         }
04136         else {
04137             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04138         }
04139     }
04140     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04141     rb_str_buf_cat(result, buf, l);
04142     return l;
04143 }
04144 
04145 /*
04146  * call-seq:
04147  *   str.inspect   -> string
04148  *
04149  * Returns a printable version of _str_, surrounded by quote marks,
04150  * with special characters escaped.
04151  *
04152  *    str = "hello"
04153  *    str[3] = "\b"
04154  *    str.inspect       #=> "\"hel\\bo\""
04155  */
04156 
04157 VALUE
04158 rb_str_inspect(VALUE str)
04159 {
04160     rb_encoding *enc = STR_ENC_GET(str);
04161     const char *p, *pend, *prev;
04162     char buf[CHAR_ESC_LEN + 1];
04163     VALUE result = rb_str_buf_new(0);
04164     rb_encoding *resenc = rb_default_internal_encoding();
04165     int unicode_p = rb_enc_unicode_p(enc);
04166     int asciicompat = rb_enc_asciicompat(enc);
04167 
04168     if (resenc == NULL) resenc = rb_default_external_encoding();
04169     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04170     rb_enc_associate(result, resenc);
04171     str_buf_cat2(result, "\"");
04172 
04173     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04174     prev = p;
04175     while (p < pend) {
04176         unsigned int c, cc;
04177         int n;
04178 
04179         n = rb_enc_precise_mbclen(p, pend, enc);
04180         if (!MBCLEN_CHARFOUND_P(n)) {
04181             if (p > prev) str_buf_cat(result, prev, p - prev);
04182             n = rb_enc_mbminlen(enc);
04183             if (pend < p + n)
04184                 n = (int)(pend - p);
04185             while (n--) {
04186                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04187                 str_buf_cat(result, buf, strlen(buf));
04188                 prev = ++p;
04189             }
04190             continue;
04191         }
04192         n = MBCLEN_CHARFOUND_LEN(n);
04193         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04194         p += n;
04195         if (c == '"'|| c == '\\' ||
04196             (c == '#' &&
04197              p < pend &&
04198              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04199              (cc = rb_enc_codepoint(p,pend,enc),
04200               (cc == '$' || cc == '@' || cc == '{')))) {
04201             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04202             str_buf_cat2(result, "\\");
04203             if (asciicompat || enc == resenc) {
04204                 prev = p - n;
04205                 continue;
04206             }
04207         }
04208         switch (c) {
04209           case '\n': cc = 'n'; break;
04210           case '\r': cc = 'r'; break;
04211           case '\t': cc = 't'; break;
04212           case '\f': cc = 'f'; break;
04213           case '\013': cc = 'v'; break;
04214           case '\010': cc = 'b'; break;
04215           case '\007': cc = 'a'; break;
04216           case 033: cc = 'e'; break;
04217           default: cc = 0; break;
04218         }
04219         if (cc) {
04220             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04221             buf[0] = '\\';
04222             buf[1] = (char)cc;
04223             str_buf_cat(result, buf, 2);
04224             prev = p;
04225             continue;
04226         }
04227         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04228             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04229             continue;
04230         }
04231         else {
04232             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04233             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04234             prev = p;
04235             continue;
04236         }
04237     }
04238     if (p > prev) str_buf_cat(result, prev, p - prev);
04239     str_buf_cat2(result, "\"");
04240 
04241     OBJ_INFECT(result, str);
04242     return result;
04243 }
04244 
04245 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04246 
04247 /*
04248  *  call-seq:
04249  *     str.dump   -> new_str
04250  *
04251  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
04252  *  <code>\nnn</code> notation and all special characters escaped.
04253  */
04254 
04255 VALUE
04256 rb_str_dump(VALUE str)
04257 {
04258     rb_encoding *enc = rb_enc_get(str);
04259     long len;
04260     const char *p, *pend;
04261     char *q, *qend;
04262     VALUE result;
04263     int u8 = (enc == rb_utf8_encoding());
04264 
04265     len = 2;                    /* "" */
04266     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04267     while (p < pend) {
04268         unsigned char c = *p++;
04269         switch (c) {
04270           case '"':  case '\\':
04271           case '\n': case '\r':
04272           case '\t': case '\f':
04273           case '\013': case '\010': case '\007': case '\033':
04274             len += 2;
04275             break;
04276 
04277           case '#':
04278             len += IS_EVSTR(p, pend) ? 2 : 1;
04279             break;
04280 
04281           default:
04282             if (ISPRINT(c)) {
04283                 len++;
04284             }
04285             else {
04286                 if (u8) {       /* \u{NN} */
04287                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04288                     if (MBCLEN_CHARFOUND_P(n-1)) {
04289                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04290                         while (cc >>= 4) len++;
04291                         len += 5;
04292                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04293                         break;
04294                     }
04295                 }
04296                 len += 4;       /* \xNN */
04297             }
04298             break;
04299         }
04300     }
04301     if (!rb_enc_asciicompat(enc)) {
04302         len += 19;              /* ".force_encoding('')" */
04303         len += strlen(enc->name);
04304     }
04305 
04306     result = rb_str_new5(str, 0, len);
04307     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04308     q = RSTRING_PTR(result); qend = q + len + 1;
04309 
04310     *q++ = '"';
04311     while (p < pend) {
04312         unsigned char c = *p++;
04313 
04314         if (c == '"' || c == '\\') {
04315             *q++ = '\\';
04316             *q++ = c;
04317         }
04318         else if (c == '#') {
04319             if (IS_EVSTR(p, pend)) *q++ = '\\';
04320             *q++ = '#';
04321         }
04322         else if (c == '\n') {
04323             *q++ = '\\';
04324             *q++ = 'n';
04325         }
04326         else if (c == '\r') {
04327             *q++ = '\\';
04328             *q++ = 'r';
04329         }
04330         else if (c == '\t') {
04331             *q++ = '\\';
04332             *q++ = 't';
04333         }
04334         else if (c == '\f') {
04335             *q++ = '\\';
04336             *q++ = 'f';
04337         }
04338         else if (c == '\013') {
04339             *q++ = '\\';
04340             *q++ = 'v';
04341         }
04342         else if (c == '\010') {
04343             *q++ = '\\';
04344             *q++ = 'b';
04345         }
04346         else if (c == '\007') {
04347             *q++ = '\\';
04348             *q++ = 'a';
04349         }
04350         else if (c == '\033') {
04351             *q++ = '\\';
04352             *q++ = 'e';
04353         }
04354         else if (ISPRINT(c)) {
04355             *q++ = c;
04356         }
04357         else {
04358             *q++ = '\\';
04359             if (u8) {
04360                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04361                 if (MBCLEN_CHARFOUND_P(n)) {
04362                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04363                     p += n;
04364                     snprintf(q, qend-q, "u{%x}", cc);
04365                     q += strlen(q);
04366                     continue;
04367                 }
04368             }
04369             snprintf(q, qend-q, "x%02X", c);
04370             q += 3;
04371         }
04372     }
04373     *q++ = '"';
04374     *q = '\0';
04375     if (!rb_enc_asciicompat(enc)) {
04376         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04377         enc = rb_ascii8bit_encoding();
04378     }
04379     OBJ_INFECT(result, str);
04380     /* result from dump is ASCII */
04381     rb_enc_associate(result, enc);
04382     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04383     return result;
04384 }
04385 
04386 
04387 static void
04388 rb_str_check_dummy_enc(rb_encoding *enc)
04389 {
04390     if (rb_enc_dummy_p(enc)) {
04391         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04392                  rb_enc_name(enc));
04393     }
04394 }
04395 
04396 /*
04397  *  call-seq:
04398  *     str.upcase!   -> str or nil
04399  *
04400  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04401  *  were made.
04402  *  Note: case replacement is effective only in ASCII region.
04403  */
04404 
04405 static VALUE
04406 rb_str_upcase_bang(VALUE str)
04407 {
04408     rb_encoding *enc;
04409     char *s, *send;
04410     int modify = 0;
04411     int n;
04412 
04413     str_modify_keep_cr(str);
04414     enc = STR_ENC_GET(str);
04415     rb_str_check_dummy_enc(enc);
04416     s = RSTRING_PTR(str); send = RSTRING_END(str);
04417     if (single_byte_optimizable(str)) {
04418         while (s < send) {
04419             unsigned int c = *(unsigned char*)s;
04420 
04421             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04422                 *s = 'A' + (c - 'a');
04423                 modify = 1;
04424             }
04425             s++;
04426         }
04427     }
04428     else {
04429         int ascompat = rb_enc_asciicompat(enc);
04430 
04431         while (s < send) {
04432             unsigned int c;
04433 
04434             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04435                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04436                     *s = 'A' + (c - 'a');
04437                     modify = 1;
04438                 }
04439                 s++;
04440             }
04441             else {
04442                 c = rb_enc_codepoint_len(s, send, &n, enc);
04443                 if (rb_enc_islower(c, enc)) {
04444                     /* assuming toupper returns codepoint with same size */
04445                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04446                     modify = 1;
04447                 }
04448                 s += n;
04449             }
04450         }
04451     }
04452 
04453     if (modify) return str;
04454     return Qnil;
04455 }
04456 
04457 
04458 /*
04459  *  call-seq:
04460  *     str.upcase   -> new_str
04461  *
04462  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04463  *  uppercase counterparts. The operation is locale insensitive---only
04464  *  characters ``a'' to ``z'' are affected.
04465  *  Note: case replacement is effective only in ASCII region.
04466  *
04467  *     "hEllO".upcase   #=> "HELLO"
04468  */
04469 
04470 static VALUE
04471 rb_str_upcase(VALUE str)
04472 {
04473     str = rb_str_dup(str);
04474     rb_str_upcase_bang(str);
04475     return str;
04476 }
04477 
04478 
04479 /*
04480  *  call-seq:
04481  *     str.downcase!   -> str or nil
04482  *
04483  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04484  *  changes were made.
04485  *  Note: case replacement is effective only in ASCII region.
04486  */
04487 
04488 static VALUE
04489 rb_str_downcase_bang(VALUE str)
04490 {
04491     rb_encoding *enc;
04492     char *s, *send;
04493     int modify = 0;
04494 
04495     str_modify_keep_cr(str);
04496     enc = STR_ENC_GET(str);
04497     rb_str_check_dummy_enc(enc);
04498     s = RSTRING_PTR(str); send = RSTRING_END(str);
04499     if (single_byte_optimizable(str)) {
04500         while (s < send) {
04501             unsigned int c = *(unsigned char*)s;
04502 
04503             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04504                 *s = 'a' + (c - 'A');
04505                 modify = 1;
04506             }
04507             s++;
04508         }
04509     }
04510     else {
04511         int ascompat = rb_enc_asciicompat(enc);
04512 
04513         while (s < send) {
04514             unsigned int c;
04515             int n;
04516 
04517             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04518                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04519                     *s = 'a' + (c - 'A');
04520                     modify = 1;
04521                 }
04522                 s++;
04523             }
04524             else {
04525                 c = rb_enc_codepoint_len(s, send, &n, enc);
04526                 if (rb_enc_isupper(c, enc)) {
04527                     /* assuming toupper returns codepoint with same size */
04528                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04529                     modify = 1;
04530                 }
04531                 s += n;
04532             }
04533         }
04534     }
04535 
04536     if (modify) return str;
04537     return Qnil;
04538 }
04539 
04540 
04541 /*
04542  *  call-seq:
04543  *     str.downcase   -> new_str
04544  *
04545  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04546  *  lowercase counterparts. The operation is locale insensitive---only
04547  *  characters ``A'' to ``Z'' are affected.
04548  *  Note: case replacement is effective only in ASCII region.
04549  *
04550  *     "hEllO".downcase   #=> "hello"
04551  */
04552 
04553 static VALUE
04554 rb_str_downcase(VALUE str)
04555 {
04556     str = rb_str_dup(str);
04557     rb_str_downcase_bang(str);
04558     return str;
04559 }
04560 
04561 
04562 /*
04563  *  call-seq:
04564  *     str.capitalize!   -> str or nil
04565  *
04566  *  Modifies <i>str</i> by converting the first character to uppercase and the
04567  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04568  *  Note: case conversion is effective only in ASCII region.
04569  *
04570  *     a = "hello"
04571  *     a.capitalize!   #=> "Hello"
04572  *     a               #=> "Hello"
04573  *     a.capitalize!   #=> nil
04574  */
04575 
04576 static VALUE
04577 rb_str_capitalize_bang(VALUE str)
04578 {
04579     rb_encoding *enc;
04580     char *s, *send;
04581     int modify = 0;
04582     unsigned int c;
04583     int n;
04584 
04585     str_modify_keep_cr(str);
04586     enc = STR_ENC_GET(str);
04587     rb_str_check_dummy_enc(enc);
04588     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04589     s = RSTRING_PTR(str); send = RSTRING_END(str);
04590 
04591     c = rb_enc_codepoint_len(s, send, &n, enc);
04592     if (rb_enc_islower(c, enc)) {
04593         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04594         modify = 1;
04595     }
04596     s += n;
04597     while (s < send) {
04598         c = rb_enc_codepoint_len(s, send, &n, enc);
04599         if (rb_enc_isupper(c, enc)) {
04600             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04601             modify = 1;
04602         }
04603         s += n;
04604     }
04605 
04606     if (modify) return str;
04607     return Qnil;
04608 }
04609 
04610 
04611 /*
04612  *  call-seq:
04613  *     str.capitalize   -> new_str
04614  *
04615  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04616  *  and the remainder to lowercase.
04617  *  Note: case conversion is effective only in ASCII region.
04618  *
04619  *     "hello".capitalize    #=> "Hello"
04620  *     "HELLO".capitalize    #=> "Hello"
04621  *     "123ABC".capitalize   #=> "123abc"
04622  */
04623 
04624 static VALUE
04625 rb_str_capitalize(VALUE str)
04626 {
04627     str = rb_str_dup(str);
04628     rb_str_capitalize_bang(str);
04629     return str;
04630 }
04631 
04632 
04633 /*
04634  *  call-seq:
04635 *     str.swapcase!   -> str or nil
04636  *
04637  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
04638  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
04639  *  Note: case conversion is effective only in ASCII region.
04640  */
04641 
04642 static VALUE
04643 rb_str_swapcase_bang(VALUE str)
04644 {
04645     rb_encoding *enc;
04646     char *s, *send;
04647     int modify = 0;
04648     int n;
04649 
04650     str_modify_keep_cr(str);
04651     enc = STR_ENC_GET(str);
04652     rb_str_check_dummy_enc(enc);
04653     s = RSTRING_PTR(str); send = RSTRING_END(str);
04654     while (s < send) {
04655         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04656 
04657         if (rb_enc_isupper(c, enc)) {
04658             /* assuming toupper returns codepoint with same size */
04659             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04660             modify = 1;
04661         }
04662         else if (rb_enc_islower(c, enc)) {
04663             /* assuming tolower returns codepoint with same size */
04664             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04665             modify = 1;
04666         }
04667         s += n;
04668     }
04669 
04670     if (modify) return str;
04671     return Qnil;
04672 }
04673 
04674 
04675 /*
04676  *  call-seq:
04677  *     str.swapcase   -> new_str
04678  *
04679  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
04680  *  to lowercase and lowercase characters converted to uppercase.
04681  *  Note: case conversion is effective only in ASCII region.
04682  *
04683  *     "Hello".swapcase          #=> "hELLO"
04684  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
04685  */
04686 
04687 static VALUE
04688 rb_str_swapcase(VALUE str)
04689 {
04690     str = rb_str_dup(str);
04691     rb_str_swapcase_bang(str);
04692     return str;
04693 }
04694 
04695 typedef unsigned char *USTR;
04696 
/* Iteration state of trnext() over one transliteration spec string. */
struct tr {
    int gen;            /* non-zero while a "lo-hi" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* read position within the spec */
    char *pend;         /* end of the spec */
};
04702 
04703 static unsigned int
04704 trnext(struct tr *t, rb_encoding *enc)
04705 {
04706     int n;
04707 
04708     for (;;) {
04709         if (!t->gen) {
04710             if (t->p == t->pend) return -1;
04711             if (t->p < t->pend - 1 && *t->p == '\\') {
04712                 t->p++;
04713             }
04714             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04715             t->p += n;
04716             if (t->p < t->pend - 1 && *t->p == '-') {
04717                 t->p++;
04718                 if (t->p < t->pend) {
04719                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04720                     t->p += n;
04721                     if (t->now > c) {
04722                         if (t->now < 0x80 && c < 0x80) {
04723                             rb_raise(rb_eArgError,
04724                                      "invalid range \"%c-%c\" in string transliteration",
04725                                      t->now, c);
04726                         }
04727                         else {
04728                             rb_raise(rb_eArgError, "invalid range in string transliteration");
04729                         }
04730                         continue; /* not reached */
04731                     }
04732                     t->gen = 1;
04733                     t->max = c;
04734                 }
04735             }
04736             return t->now;
04737         }
04738         else if (++t->now < t->max) {
04739             return t->now;
04740         }
04741         else {
04742             t->gen = 0;
04743             return t->max;
04744         }
04745     }
04746 }
04747 
04748 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04749 
04750 static VALUE
04751 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04752 {
04753     const unsigned int errc = -1;
04754     unsigned int trans[256];
04755     rb_encoding *enc, *e1, *e2;
04756     struct tr trsrc, trrepl;
04757     int cflag = 0;
04758     unsigned int c, c0;
04759     int last = 0, modify = 0, i, l;
04760     char *s, *send;
04761     VALUE hash = 0;
04762     int singlebyte = single_byte_optimizable(str);
04763     int cr;
04764 
04765 #define CHECK_IF_ASCII(c) \
04766     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
04767            (cr = ENC_CODERANGE_VALID) : 0)
04768 
04769     StringValue(src);
04770     StringValue(repl);
04771     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04772     if (RSTRING_LEN(repl) == 0) {
04773         return rb_str_delete_bang(1, &src, str);
04774     }
04775 
04776     cr = ENC_CODERANGE(str);
04777     e1 = rb_enc_check(str, src);
04778     e2 = rb_enc_check(str, repl);
04779     if (e1 == e2) {
04780         enc = e1;
04781     }
04782     else {
04783         enc = rb_enc_check(src, repl);
04784     }
04785     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
04786     if (RSTRING_LEN(src) > 1 &&
04787         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
04788         trsrc.p + l < trsrc.pend) {
04789         cflag = 1;
04790         trsrc.p += l;
04791     }
04792     trrepl.p = RSTRING_PTR(repl);
04793     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
04794     trsrc.gen = trrepl.gen = 0;
04795     trsrc.now = trrepl.now = 0;
04796     trsrc.max = trrepl.max = 0;
04797 
04798     if (cflag) {
04799         for (i=0; i<256; i++) {
04800             trans[i] = 1;
04801         }
04802         while ((c = trnext(&trsrc, enc)) != errc) {
04803             if (c < 256) {
04804                 trans[c] = errc;
04805             }
04806             else {
04807                 if (!hash) hash = rb_hash_new();
04808                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
04809             }
04810         }
04811         while ((c = trnext(&trrepl, enc)) != errc)
04812             /* retrieve last replacer */;
04813         last = trrepl.now;
04814         for (i=0; i<256; i++) {
04815             if (trans[i] != errc) {
04816                 trans[i] = last;
04817             }
04818         }
04819     }
04820     else {
04821         unsigned int r;
04822 
04823         for (i=0; i<256; i++) {
04824             trans[i] = errc;
04825         }
04826         while ((c = trnext(&trsrc, enc)) != errc) {
04827             r = trnext(&trrepl, enc);
04828             if (r == errc) r = trrepl.now;
04829             if (c < 256) {
04830                 trans[c] = r;
04831                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
04832             }
04833             else {
04834                 if (!hash) hash = rb_hash_new();
04835                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
04836             }
04837         }
04838     }
04839 
04840     if (cr == ENC_CODERANGE_VALID)
04841         cr = ENC_CODERANGE_7BIT;
04842     str_modify_keep_cr(str);
04843     s = RSTRING_PTR(str); send = RSTRING_END(str);
04844     if (sflag) {
04845         int clen, tlen;
04846         long offset, max = RSTRING_LEN(str);
04847         unsigned int save = -1;
04848         char *buf = ALLOC_N(char, max), *t = buf;
04849 
04850         while (s < send) {
04851             int may_modify = 0;
04852 
04853             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04854             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04855 
04856             s += clen;
04857             if (c < 256) {
04858                 c = trans[c];
04859             }
04860             else if (hash) {
04861                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04862                 if (NIL_P(tmp)) {
04863                     if (cflag) c = last;
04864                     else c = errc;
04865                 }
04866                 else if (cflag) c = errc;
04867                 else c = NUM2INT(tmp);
04868             }
04869             else {
04870                 c = errc;
04871             }
04872             if (c != (unsigned int)-1) {
04873                 if (save == c) {
04874                     CHECK_IF_ASCII(c);
04875                     continue;
04876                 }
04877                 save = c;
04878                 tlen = rb_enc_codelen(c, enc);
04879                 modify = 1;
04880             }
04881             else {
04882                 save = -1;
04883                 c = c0;
04884                 if (enc != e1) may_modify = 1;
04885             }
04886             while (t - buf + tlen >= max) {
04887                 offset = t - buf;
04888                 max *= 2;
04889                 REALLOC_N(buf, char, max);
04890                 t = buf + offset;
04891             }
04892             rb_enc_mbcput(c, t, enc);
04893             if (may_modify && memcmp(s, t, tlen) != 0) {
04894                 modify = 1;
04895             }
04896             CHECK_IF_ASCII(c);
04897             t += tlen;
04898         }
04899         if (!STR_EMBED_P(str)) {
04900             xfree(RSTRING(str)->as.heap.ptr);
04901         }
04902         *t = '\0';
04903         RSTRING(str)->as.heap.ptr = buf;
04904         RSTRING(str)->as.heap.len = t - buf;
04905         STR_SET_NOEMBED(str);
04906         RSTRING(str)->as.heap.aux.capa = max;
04907     }
04908     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
04909         while (s < send) {
04910             c = (unsigned char)*s;
04911             if (trans[c] != errc) {
04912                 if (!cflag) {
04913                     c = trans[c];
04914                     *s = c;
04915                     modify = 1;
04916                 }
04917                 else {
04918                     *s = last;
04919                     modify = 1;
04920                 }
04921             }
04922             CHECK_IF_ASCII(c);
04923             s++;
04924         }
04925     }
04926     else {
04927         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
04928         long offset;
04929         char *buf = ALLOC_N(char, max), *t = buf;
04930 
04931         while (s < send) {
04932             int may_modify = 0;
04933             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04934             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04935 
04936             if (c < 256) {
04937                 c = trans[c];
04938             }
04939             else if (hash) {
04940                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04941                 if (NIL_P(tmp)) {
04942                     if (cflag) c = last;
04943                     else c = errc;
04944                 }
04945                 else if (cflag) c = errc;
04946                 else c = NUM2INT(tmp);
04947             }
04948             else {
04949                 c = errc;
04950             }
04951             if (c != errc) {
04952                 tlen = rb_enc_codelen(c, enc);
04953                 modify = 1;
04954             }
04955             else {
04956                 c = c0;
04957                 if (enc != e1) may_modify = 1;
04958             }
04959             while (t - buf + tlen >= max) {
04960                 offset = t - buf;
04961                 max *= 2;
04962                 REALLOC_N(buf, char, max);
04963                 t = buf + offset;
04964             }
04965             if (s != t) {
04966                 rb_enc_mbcput(c, t, enc);
04967                 if (may_modify && memcmp(s, t, tlen) != 0) {
04968                     modify = 1;
04969                 }
04970             }
04971             CHECK_IF_ASCII(c);
04972             s += clen;
04973             t += tlen;
04974         }
04975         if (!STR_EMBED_P(str)) {
04976             xfree(RSTRING(str)->as.heap.ptr);
04977         }
04978         *t = '\0';
04979         RSTRING(str)->as.heap.ptr = buf;
04980         RSTRING(str)->as.heap.len = t - buf;
04981         STR_SET_NOEMBED(str);
04982         RSTRING(str)->as.heap.aux.capa = max;
04983     }
04984 
04985     if (modify) {
04986         if (cr != ENC_CODERANGE_BROKEN)
04987             ENC_CODERANGE_SET(str, cr);
04988         rb_enc_associate(str, enc);
04989         return str;
04990     }
04991     return Qnil;
04992 }
04993 
04994 
04995 /*
04996  *  call-seq:
04997  *     str.tr!(from_str, to_str)   -> str or nil
04998  *
04999  *  Translates <i>str</i> in place, using the same rules as
05000  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
05001  *  changes were made.
05002  */
05003 
05004 static VALUE
05005 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05006 {
05007     return tr_trans(str, src, repl, 0);
05008 }
05009 
05010 
05011 /*
05012  *  call-seq:
05013  *     str.tr(from_str, to_str)   -> new_str
05014  *
05015  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
05016  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
05017  *  shorter than <i>from_str</i>, it is padded with its last character. Both
05018  *  strings may use the c1--c2 notation to denote ranges of characters, and
05019  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
05020  *  characters except those listed.
05021  *
05022  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05023  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05024  *     "hello".tr('el', 'ip')      #=> "hippo"
05025  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05026  */
05027 
05028 static VALUE
05029 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05030 {
05031     str = rb_str_dup(str);
05032     tr_trans(str, src, repl, 0);
05033     return str;
05034 }
05035 
05036 static void
05037 tr_setup_table(VALUE str, char stable[256], int first,
05038                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05039 {
05040     const unsigned int errc = -1;
05041     char buf[256];
05042     struct tr tr;
05043     unsigned int c;
05044     VALUE table = 0, ptable = 0;
05045     int i, l, cflag = 0;
05046 
05047     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05048     tr.gen = tr.now = tr.max = 0;
05049 
05050     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05051         cflag = 1;
05052         tr.p += l;
05053 
05054         table = rb_hash_new();
05055         ptable = *ctablep;
05056         *ctablep = table;
05057     }
05058     else {
05059         table = rb_hash_new();
05060         ptable = *tablep;
05061         *tablep = table;
05062     }
05063     if (first) {
05064         for (i=0; i<256; i++) {
05065             stable[i] = 1;
05066         }
05067     }
05068     for (i=0; i<256; i++) {
05069         buf[i] = cflag;
05070     }
05071 
05072     while ((c = trnext(&tr, enc)) != errc) {
05073         if (c < 256) {
05074             buf[c & 0xff] = !cflag;
05075         }
05076         else {
05077             VALUE key = UINT2NUM(c);
05078 
05079             if (!table) {
05080                 table = rb_hash_new();
05081                 ptable = *tablep;
05082                 *tablep = table;
05083             }
05084             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05085                 rb_hash_aset(table, key, Qtrue);
05086             }
05087         }
05088     }
05089     for (i=0; i<256; i++) {
05090         stable[i] = stable[i] && buf[i];
05091     }
05092 }
05093 
05094 
05095 static int
05096 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05097 {
05098     if (c < 256) {
05099         return table[c] != 0;
05100     }
05101     else {
05102         VALUE v = UINT2NUM(c);
05103 
05104         if (del) {
05105             if (!NIL_P(rb_hash_lookup(del, v)) &&
05106                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05107                 return TRUE;
05108             }
05109         }
05110         else if (nodel && NIL_P(rb_hash_lookup(nodel, v))) {
05111             return TRUE;
05112         }
05113         return FALSE;
05114     }
05115 }
05116 
05117 /*
05118  *  call-seq:
05119  *     str.delete!([other_str]+)   -> str or nil
05120  *
05121  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05122  *  <code>nil</code> if <i>str</i> was not modified.
05123  */
05124 
05125 static VALUE
05126 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05127 {
05128     char squeez[256];
05129     rb_encoding *enc = 0;
05130     char *s, *send, *t;
05131     VALUE del = 0, nodel = 0;
05132     int modify = 0;
05133     int i, ascompat, cr;
05134 
05135     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05136     if (argc < 1) {
05137         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05138     }
05139     for (i=0; i<argc; i++) {
05140         VALUE s = argv[i];
05141 
05142         StringValue(s);
05143         enc = rb_enc_check(str, s);
05144         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05145     }
05146 
05147     str_modify_keep_cr(str);
05148     ascompat = rb_enc_asciicompat(enc);
05149     s = t = RSTRING_PTR(str);
05150     send = RSTRING_END(str);
05151     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05152     while (s < send) {
05153         unsigned int c;
05154         int clen;
05155 
05156         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05157             if (squeez[c]) {
05158                 modify = 1;
05159             }
05160             else {
05161                 if (t != s) *t = c;
05162                 t++;
05163             }
05164             s++;
05165         }
05166         else {
05167             c = rb_enc_codepoint_len(s, send, &clen, enc);
05168 
05169             if (tr_find(c, squeez, del, nodel)) {
05170                 modify = 1;
05171             }
05172             else {
05173                 if (t != s) rb_enc_mbcput(c, t, enc);
05174                 t += clen;
05175                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05176             }
05177             s += clen;
05178         }
05179     }
05180     *t = '\0';
05181     STR_SET_LEN(str, t - RSTRING_PTR(str));
05182     ENC_CODERANGE_SET(str, cr);
05183 
05184     if (modify) return str;
05185     return Qnil;
05186 }
05187 
05188 
05189 /*
05190  *  call-seq:
05191  *     str.delete([other_str]+)   -> new_str
05192  *
05193  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05194  *  arguments deleted. Uses the same rules for building the set of characters as
05195  *  <code>String#count</code>.
05196  *
05197  *     "hello".delete "l","lo"        #=> "heo"
05198  *     "hello".delete "lo"            #=> "he"
05199  *     "hello".delete "aeiou", "^e"   #=> "hell"
05200  *     "hello".delete "ej-m"          #=> "ho"
05201  */
05202 
05203 static VALUE
05204 rb_str_delete(int argc, VALUE *argv, VALUE str)
05205 {
05206     str = rb_str_dup(str);
05207     rb_str_delete_bang(argc, argv, str);
05208     return str;
05209 }
05210 
05211 
05212 /*
05213  *  call-seq:
05214  *     str.squeeze!([other_str]*)   -> str or nil
05215  *
05216  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05217  *  <code>nil</code> if no changes were made.
05218  */
05219 
05220 static VALUE
05221 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05222 {
05223     char squeez[256];
05224     rb_encoding *enc = 0;
05225     VALUE del = 0, nodel = 0;
05226     char *s, *send, *t;
05227     int i, modify = 0;
05228     int ascompat, singlebyte = single_byte_optimizable(str);
05229     unsigned int save;
05230 
05231     if (argc == 0) {
05232         enc = STR_ENC_GET(str);
05233     }
05234     else {
05235         for (i=0; i<argc; i++) {
05236             VALUE s = argv[i];
05237 
05238             StringValue(s);
05239             enc = rb_enc_check(str, s);
05240             if (singlebyte && !single_byte_optimizable(s))
05241                 singlebyte = 0;
05242             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05243         }
05244     }
05245 
05246     str_modify_keep_cr(str);
05247     s = t = RSTRING_PTR(str);
05248     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05249     send = RSTRING_END(str);
05250     save = -1;
05251     ascompat = rb_enc_asciicompat(enc);
05252 
05253     if (singlebyte) {
05254         while (s < send) {
05255             unsigned int c = *(unsigned char*)s++;
05256             if (c != save || (argc > 0 && !squeez[c])) {
05257                 *t++ = save = c;
05258             }
05259         }
05260     } else {
05261         while (s < send) {
05262             unsigned int c;
05263             int clen;
05264 
05265             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05266                 if (c != save || (argc > 0 && !squeez[c])) {
05267                     *t++ = save = c;
05268                 }
05269                 s++;
05270             }
05271             else {
05272                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05273 
05274                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05275                     if (t != s) rb_enc_mbcput(c, t, enc);
05276                     save = c;
05277                     t += clen;
05278                 }
05279                 s += clen;
05280             }
05281         }
05282     }
05283 
05284     *t = '\0';
05285     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05286         STR_SET_LEN(str, t - RSTRING_PTR(str));
05287         modify = 1;
05288     }
05289 
05290     if (modify) return str;
05291     return Qnil;
05292 }
05293 
05294 
05295 /*
05296  *  call-seq:
05297  *     str.squeeze([other_str]*)    -> new_str
05298  *
05299  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05300  *  procedure described for <code>String#count</code>. Returns a new string
05301  *  where runs of the same character that occur in this set are replaced by a
05302  *  single character. If no arguments are given, all runs of identical
05303  *  characters are replaced by a single character.
05304  *
05305  *     "yellow moon".squeeze                  #=> "yelow mon"
05306  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05307  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05308  */
05309 
05310 static VALUE
05311 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05312 {
05313     str = rb_str_dup(str);
05314     rb_str_squeeze_bang(argc, argv, str);
05315     return str;
05316 }
05317 
05318 
05319 /*
05320  *  call-seq:
05321  *     str.tr_s!(from_str, to_str)   -> str or nil
05322  *
05323  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05324  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05325  */
05326 
05327 static VALUE
05328 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05329 {
05330     return tr_trans(str, src, repl, 1);
05331 }
05332 
05333 
05334 /*
05335  *  call-seq:
05336  *     str.tr_s(from_str, to_str)   -> new_str
05337  *
05338  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05339  *  then removes duplicate characters in regions that were affected by the
05340  *  translation.
05341  *
05342  *     "hello".tr_s('l', 'r')     #=> "hero"
05343  *     "hello".tr_s('el', '*')    #=> "h*o"
05344  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05345  */
05346 
05347 static VALUE
05348 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05349 {
05350     str = rb_str_dup(str);
05351     tr_trans(str, src, repl, 1);
05352     return str;
05353 }
05354 
05355 
05356 /*
05357  *  call-seq:
05358  *     str.count([other_str]+)   -> fixnum
05359  *
05360  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
05361  *  intersection of these sets defines the characters to count in
05362  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
05363  *  negated. The sequence c1--c2 means all characters between c1 and c2.
05364  *
05365  *     a = "hello world"
05366  *     a.count "lo"            #=> 5
05367  *     a.count "lo", "o"       #=> 2
05368  *     a.count "hello", "^l"   #=> 4
05369  *     a.count "ej-m"          #=> 4
05370  */
05371 
05372 static VALUE
05373 rb_str_count(int argc, VALUE *argv, VALUE str)
05374 {
05375     char table[256];
05376     rb_encoding *enc = 0;
05377     VALUE del = 0, nodel = 0;
05378     char *s, *send;
05379     int i;
05380     int ascompat;
05381 
05382     if (argc < 1) {
05383         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05384     }
05385     for (i=0; i<argc; i++) {
05386         VALUE tstr = argv[i];
05387         unsigned char c;
05388 
05389         StringValue(tstr);
05390         enc = rb_enc_check(str, tstr);
05391         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05392             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05393             int n = 0;
05394 
05395             s = RSTRING_PTR(str);
05396             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05397             send = RSTRING_END(str);
05398             while (s < send) {
05399                 if (*(unsigned char*)s++ == c) n++;
05400             }
05401             return INT2NUM(n);
05402         }
05403         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05404     }
05405 
05406     s = RSTRING_PTR(str);
05407     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05408     send = RSTRING_END(str);
05409     ascompat = rb_enc_asciicompat(enc);
05410     i = 0;
05411     while (s < send) {
05412         unsigned int c;
05413 
05414         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05415             if (table[c]) {
05416                 i++;
05417             }
05418             s++;
05419         }
05420         else {
05421             int clen;
05422             c = rb_enc_codepoint_len(s, send, &clen, enc);
05423             if (tr_find(c, table, del, nodel)) {
05424                 i++;
05425             }
05426             s += clen;
05427         }
05428     }
05429 
05430     return INT2NUM(i);
05431 }
05432 
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * characters (TAB, LF, VT, FF, CR and SPACE), 0 for every other byte.
 * Only the entries up to SPACE (0x20) are written out; the remaining
 * elements of the 256-entry array are implicitly initialized to zero.
 */
static const char isspacetable[256] = {
    /* 0x00-0x08 */
    0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x09-0x0d: \t \n \v \f \r */
    1, 1, 1, 1, 1,
    /* 0x0e-0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20: ' ' */
    1
};

/* Table lookup for ASCII whitespace; the argument is converted to
 * unsigned char before it is used as an index. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05453 
05454 /*
05455  *  call-seq:
05456  *     str.split(pattern=$;, [limit])   -> anArray
05457  *
05458  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05459  *  of these substrings.
05460  *
05461  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05462  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05463  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05464  *  of contiguous whitespace characters ignored.
05465  *
05466  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05467  *  pattern matches. Whenever the pattern matches a zero-length string,
05468  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05469  *  groups, the respective matches will be returned in the array as well.
05470  *
05471  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05472  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05473  *  split on whitespace as if ` ' were specified.
05474  *
05475  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05476  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05477  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05478  *  string is returned as the only entry in an array). If negative, there is no
05479  *  limit to the number of fields returned, and trailing null fields are not
05480  *  suppressed.
05481  *
05482  *     " now's  the time".split        #=> ["now's", "the", "time"]
05483  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05484  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05485  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05486  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05487  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05488  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05489  *
05490  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05491  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05492  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05493  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05494  */
05495 
05496 static VALUE
05497 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05498 {
05499     rb_encoding *enc;
05500     VALUE spat;
05501     VALUE limit;
05502     enum {awk, string, regexp} split_type;
05503     long beg, end, i = 0;
05504     int lim = 0;
05505     VALUE result, tmp;
05506 
05507     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05508         lim = NUM2INT(limit);
05509         if (lim <= 0) limit = Qnil;
05510         else if (lim == 1) {
05511             if (RSTRING_LEN(str) == 0)
05512                 return rb_ary_new2(0);
05513             return rb_ary_new3(1, str);
05514         }
05515         i = 1;
05516     }
05517 
05518     enc = STR_ENC_GET(str);
05519     if (NIL_P(spat)) {
05520         if (!NIL_P(rb_fs)) {
05521             spat = rb_fs;
05522             goto fs_set;
05523         }
05524         split_type = awk;
05525     }
05526     else {
05527       fs_set:
05528         if (TYPE(spat) == T_STRING) {
05529             rb_encoding *enc2 = STR_ENC_GET(spat);
05530 
05531             split_type = string;
05532             if (RSTRING_LEN(spat) == 0) {
05533                 /* Special case - split into chars */
05534                 spat = rb_reg_regcomp(spat);
05535                 split_type = regexp;
05536             }
05537             else if (rb_enc_asciicompat(enc2) == 1) {
05538                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05539                     split_type = awk;
05540                 }
05541             }
05542             else {
05543                 int l;
05544                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05545                     RSTRING_LEN(spat) == l) {
05546                     split_type = awk;
05547                 }
05548             }
05549         }
05550         else {
05551             spat = get_pat(spat, 1);
05552             split_type = regexp;
05553         }
05554     }
05555 
05556     result = rb_ary_new();
05557     beg = 0;
05558     if (split_type == awk) {
05559         char *ptr = RSTRING_PTR(str);
05560         char *eptr = RSTRING_END(str);
05561         char *bptr = ptr;
05562         int skip = 1;
05563         unsigned int c;
05564 
05565         end = beg;
05566         if (is_ascii_string(str)) {
05567             while (ptr < eptr) {
05568                 c = (unsigned char)*ptr++;
05569                 if (skip) {
05570                     if (ascii_isspace(c)) {
05571                         beg = ptr - bptr;
05572                     }
05573                     else {
05574                         end = ptr - bptr;
05575                         skip = 0;
05576                         if (!NIL_P(limit) && lim <= i) break;
05577                     }
05578                 }
05579                 else if (ascii_isspace(c)) {
05580                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05581                     skip = 1;
05582                     beg = ptr - bptr;
05583                     if (!NIL_P(limit)) ++i;
05584                 }
05585                 else {
05586                     end = ptr - bptr;
05587                 }
05588             }
05589         }
05590         else {
05591             while (ptr < eptr) {
05592                 int n;
05593 
05594                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05595                 ptr += n;
05596                 if (skip) {
05597                     if (rb_isspace(c)) {
05598                         beg = ptr - bptr;
05599                     }
05600                     else {
05601                         end = ptr - bptr;
05602                         skip = 0;
05603                         if (!NIL_P(limit) && lim <= i) break;
05604                     }
05605                 }
05606                 else if (rb_isspace(c)) {
05607                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05608                     skip = 1;
05609                     beg = ptr - bptr;
05610                     if (!NIL_P(limit)) ++i;
05611                 }
05612                 else {
05613                     end = ptr - bptr;
05614                 }
05615             }
05616         }
05617     }
05618     else if (split_type == string) {
05619         char *ptr = RSTRING_PTR(str);
05620         char *temp = ptr;
05621         char *eptr = RSTRING_END(str);
05622         char *sptr = RSTRING_PTR(spat);
05623         long slen = RSTRING_LEN(spat);
05624 
05625         if (is_broken_string(str)) {
05626             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05627         }
05628         if (is_broken_string(spat)) {
05629             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05630         }
05631         enc = rb_enc_check(str, spat);
05632         while (ptr < eptr &&
05633                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05634             /* Check we are at the start of a char */
05635             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05636             if (t != ptr + end) {
05637                 ptr = t;
05638                 continue;
05639             }
05640             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05641             ptr += end + slen;
05642             if (!NIL_P(limit) && lim <= ++i) break;
05643         }
05644         beg = ptr - temp;
05645     }
05646     else {
05647         char *ptr = RSTRING_PTR(str);
05648         long len = RSTRING_LEN(str);
05649         long start = beg;
05650         long idx;
05651         int last_null = 0;
05652         struct re_registers *regs;
05653 
05654         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05655             regs = RMATCH_REGS(rb_backref_get());
05656             if (start == end && BEG(0) == END(0)) {
05657                 if (!ptr) {
05658                     rb_ary_push(result, str_new_empty(str));
05659                     break;
05660                 }
05661                 else if (last_null == 1) {
05662                     rb_ary_push(result, rb_str_subseq(str, beg,
05663                                                       rb_enc_fast_mbclen(ptr+beg,
05664                                                                          ptr+len,
05665                                                                          enc)));
05666                     beg = start;
05667                 }
05668                 else {
05669                     if (ptr+start == ptr+len)
05670                         start++;
05671                     else
05672                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05673                     last_null = 1;
05674                     continue;
05675                 }
05676             }
05677             else {
05678                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05679                 beg = start = END(0);
05680             }
05681             last_null = 0;
05682 
05683             for (idx=1; idx < regs->num_regs; idx++) {
05684                 if (BEG(idx) == -1) continue;
05685                 if (BEG(idx) == END(idx))
05686                     tmp = str_new_empty(str);
05687                 else
05688                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05689                 rb_ary_push(result, tmp);
05690             }
05691             if (!NIL_P(limit) && lim <= ++i) break;
05692         }
05693     }
05694     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05695         if (RSTRING_LEN(str) == beg)
05696             tmp = str_new_empty(str);
05697         else
05698             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05699         rb_ary_push(result, tmp);
05700     }
05701     if (NIL_P(limit) && lim == 0) {
05702         long len;
05703         while ((len = RARRAY_LEN(result)) > 0 &&
05704                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05705             rb_ary_pop(result);
05706     }
05707 
05708     return result;
05709 }
05710 
05711 VALUE
05712 rb_str_split(VALUE str, const char *sep0)
05713 {
05714     VALUE sep;
05715 
05716     StringValue(str);
05717     sep = rb_str_new2(sep0);
05718     return rb_str_split_m(1, &sep, str);
05719 }
05720 
05721 
05722 /*
05723  *  call-seq:
05724  *     str.each_line(separator=$/) {|substr| block }   -> str
05725  *     str.each_line(separator=$/)                     -> an_enumerator
05726  *
05727  *     str.lines(separator=$/) {|substr| block }       -> str
05728  *     str.lines(separator=$/)                         -> an_enumerator
05729  *
05730  *  Splits <i>str</i> using the supplied parameter as the record separator
05731  *  (<code>$/</code> by default), passing each substring in turn to the supplied
05732  *  block. If a zero-length record separator is supplied, the string is split
05733  *  into paragraphs delimited by multiple successive newlines.
05734  *
05735  *  If no block is given, an enumerator is returned instead.
05736  *
05737  *     print "Example one\n"
05738  *     "hello\nworld".each_line {|s| p s}
05739  *     print "Example two\n"
05740  *     "hello\nworld".each_line('l') {|s| p s}
05741  *     print "Example three\n"
05742  *     "hello\n\n\nworld".each_line('') {|s| p s}
05743  *
05744  *  <em>produces:</em>
05745  *
05746  *     Example one
05747  *     "hello\n"
05748  *     "world"
05749  *     Example two
05750  *     "hel"
05751  *     "l"
05752  *     "o\nworl"
05753  *     "d"
05754  *     Example three
05755  *     "hello\n\n\n"
05756  *     "world"
05757  */
05758 
05759 static VALUE
05760 rb_str_each_line(int argc, VALUE *argv, VALUE str)
05761 {
          /*
           * Yields successive pieces of str to the block, each one ending with
           * the record separator rs (the global rb_rs when no argument is
           * passed), and returns the original receiver.
           */
05762     rb_encoding *enc;
05763     VALUE rs;
05764     unsigned int newline;
05765     const char *p, *pend, *s, *ptr;
05766     long len, rslen;
05767     VALUE line;
05768     int n;
05769     VALUE orig = str;
05770 
05771     if (argc == 0) {
05772         rs = rb_rs;
05773     }
05774     else {
05775         rb_scan_args(argc, argv, "01", &rs);
05776     }
05777     RETURN_ENUMERATOR(str, argc, argv);
          /* a nil separator means the whole string is a single record */
05778     if (NIL_P(rs)) {
05779         rb_yield(str);
05780         return orig;
05781     }
          /*
           * From here on iterate over what rb_str_new4() returns instead of the
           * receiver itself.
           * NOTE(review): presumably a frozen snapshot, so the block cannot
           * disturb the scan -- confirm against rb_str_new4.
           */
05782     str = rb_str_new4(str);
          /*
           * s   - start of the line currently being collected
           * p   - scan cursor
           * ptr/len - remembered only to be handed to str_mod_check()
           */
05783     ptr = p = s = RSTRING_PTR(str);
05784     pend = p + RSTRING_LEN(str);
05785     len = RSTRING_LEN(str);
05786     StringValue(rs);
          /* default separator: search for '\n' bytes directly with memchr */
05787     if (rs == rb_default_rs) {
05788         enc = rb_enc_get(str);
05789         while (p < pend) {
05790             char *p0;
05791 
05792             p = memchr(p, '\n', pend - p);
05793             if (!p) break;
                  /*
                   * Back up to the head of the character containing this byte
                   * and skip the hit unless that character is a newline in enc.
                   */
05794             p0 = rb_enc_left_char_head(s, p, pend, enc);
05795             if (!rb_enc_is_newline(p0, pend, enc)) {
05796                 p++;
05797                 continue;
05798             }
                  /* the line ends just past the newline character */
05799             p = p0 + rb_enc_mbclen(p0, pend, enc);
05800             line = rb_str_new5(str, s, p - s);
05801             OBJ_INFECT(line, str);
05802             rb_enc_cr_str_copy_for_substr(line, str);
05803             rb_yield(line);
                  /*
                   * NOTE(review): str_mod_check is defined elsewhere; presumably
                   * it raises if str changed during the yield -- confirm.
                   */
05804             str_mod_check(str, ptr, len);
05805             s = p;
05806         }
05807         goto finish;
05808     }
05809 
05810     enc = rb_enc_check(str, rs);
05811     rslen = RSTRING_LEN(rs);
          /*
           * An empty separator selects paragraph mode, keyed on '\n'; otherwise
           * candidates are found by the first codepoint of rs and confirmed
           * with memcmp below.
           */
05812     if (rslen == 0) {
05813         newline = '\n';
05814     }
05815     else {
05816         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
05817     }
05818 
05819     while (p < pend) {
              /* c is the codepoint at p, n its length in bytes */
05820         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
05821 
05822       again:
              /*
               * Paragraph mode: on a newline, look at the next character.  If it
               * is not a newline, this was a lone newline and scanning resumes
               * from that next character (goto again).  Otherwise skip the run
               * of newlines and step back one so that p rests on the last
               * newline of the run.
               */
05823         if (rslen == 0 && c == newline) {
05824             p += n;
05825             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
05826                 goto again;
05827             }
05828             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
05829                 p += n;
05830             }
05831             p -= n;
05832         }
              /*
               * A record ends here when the codepoint matches and, for
               * separators longer than one byte, the bytes at p equal rs.
               */
05833         if (c == newline &&
05834             (rslen <= 1 ||
05835              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
                  /* the yielded line includes the separator (rslen bytes, or n in paragraph mode) */
05836             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
05837             OBJ_INFECT(line, str);
05838             rb_enc_cr_str_copy_for_substr(line, str);
05839             rb_yield(line);
05840             str_mod_check(str, ptr, len);
05841             s = p + (rslen ? rslen : n);
05842         }
05843         p += n;
05844     }
05845 
05846   finish:
          /* yield whatever follows the last separator, if anything */
05847     if (s != pend) {
05848         line = rb_str_new5(str, s, pend - s);
05849         OBJ_INFECT(line, str);
05850         rb_enc_cr_str_copy_for_substr(line, str);
05851         rb_yield(line);
05852     }
05853 
05854     return orig;
05855 }
05856 
05857 
05858 /*
05859  *  call-seq:
05860  *     str.bytes {|fixnum| block }        -> str
05861  *     str.bytes                          -> an_enumerator
05862  *
05863  *     str.each_byte {|fixnum| block }    -> str
05864  *     str.each_byte                      -> an_enumerator
05865  *
05866  *  Passes each byte in <i>str</i> to the given block, or returns
05867  *  an enumerator if no block is given.
05868  *
05869  *     "hello".each_byte {|c| print c, ' ' }
05870  *
05871  *  <em>produces:</em>
05872  *
05873  *     104 101 108 108 111
05874  */
05875 
05876 static VALUE
05877 rb_str_each_byte(VALUE str)
05878 {
05879     long i;
05880 
05881     RETURN_ENUMERATOR(str, 0, 0);
05882     for (i=0; i<RSTRING_LEN(str); i++) {
05883         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05884     }
05885     return str;
05886 }
05887 
05888 
05889 /*
05890  *  call-seq:
05891  *     str.chars {|cstr| block }        -> str
05892  *     str.chars                        -> an_enumerator
05893  *
05894  *     str.each_char {|cstr| block }    -> str
05895  *     str.each_char                    -> an_enumerator
05896  *
05897  *  Passes each character in <i>str</i> to the given block, or returns
05898  *  an enumerator if no block is given.
05899  *
05900  *     "hello".each_char {|c| print c, ' ' }
05901  *
05902  *  <em>produces:</em>
05903  *
05904  *     h e l l o
05905  */
05906 
05907 static VALUE
05908 rb_str_each_char(VALUE str)
05909 {
05910     VALUE orig = str;
05911     long i, len, n;
05912     const char *ptr;
05913     rb_encoding *enc;
05914 
05915     RETURN_ENUMERATOR(str, 0, 0);
05916     str = rb_str_new4(str);
05917     ptr = RSTRING_PTR(str);
05918     len = RSTRING_LEN(str);
05919     enc = rb_enc_get(str);
05920     switch (ENC_CODERANGE(str)) {
05921       case ENC_CODERANGE_VALID:
05922       case ENC_CODERANGE_7BIT:
05923         for (i = 0; i < len; i += n) {
05924             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05925             rb_yield(rb_str_subseq(str, i, n));
05926         }
05927         break;
05928       default:
05929         for (i = 0; i < len; i += n) {
05930             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05931             rb_yield(rb_str_subseq(str, i, n));
05932         }
05933     }
05934     return orig;
05935 }
05936 
05937 /*
05938  *  call-seq:
05939  *     str.codepoints {|integer| block }        -> str
05940  *     str.codepoints                           -> an_enumerator
05941  *
05942  *     str.each_codepoint {|integer| block }    -> str
05943  *     str.each_codepoint                       -> an_enumerator
05944  *
05945  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
05946  *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
05947  *  given block.
05948  *
05949  *  If no block is given, an enumerator is returned instead.
05950  *
05951  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
05952  *
05953  *  <em>produces:</em>
05954  *
05955  *     104 101 108 108 111 1593
05956  */
05957 
05958 static VALUE
05959 rb_str_each_codepoint(VALUE str)
05960 {
05961     VALUE orig = str;
05962     long len;
05963     int n;
05964     unsigned int c;
05965     const char *ptr, *end;
05966     rb_encoding *enc;
05967 
05968     if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05969     RETURN_ENUMERATOR(str, 0, 0);
05970     str = rb_str_new4(str);
05971     ptr = RSTRING_PTR(str);
05972     len = RSTRING_LEN(str);
05973     end = RSTRING_END(str);
05974     enc = STR_ENC_GET(str);
05975     while (ptr < end) {
05976         c = rb_enc_codepoint_len(ptr, end, &n, enc);
05977         rb_yield(UINT2NUM(c));
05978         ptr += n;
05979     }
05980     return orig;
05981 }
05982 
05983 static long
05984 chopped_length(VALUE str)
05985 {
05986     rb_encoding *enc = STR_ENC_GET(str);
05987     const char *p, *p2, *beg, *end;
05988 
05989     beg = RSTRING_PTR(str);
05990     end = beg + RSTRING_LEN(str);
05991     if (beg > end) return 0;
05992     p = rb_enc_prev_char(beg, end, end, enc);
05993     if (!p) return 0;
05994     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05995         p2 = rb_enc_prev_char(beg, p, end, enc);
05996         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05997     }
05998     return p - beg;
05999 }
06000 
06001 /*
06002  *  call-seq:
06003  *     str.chop!   -> str or nil
06004  *
06005  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06006  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06007  *  <code>String#chomp!</code>.
06008  */
06009 
06010 static VALUE
06011 rb_str_chop_bang(VALUE str)
06012 {
06013     str_modify_keep_cr(str);
06014     if (RSTRING_LEN(str) > 0) {
06015         long len;
06016         len = chopped_length(str);
06017         STR_SET_LEN(str, len);
06018         RSTRING_PTR(str)[len] = '\0';
06019         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06020             ENC_CODERANGE_CLEAR(str);
06021         }
06022         return str;
06023     }
06024     return Qnil;
06025 }
06026 
06027 
06028 /*
06029  *  call-seq:
06030  *     str.chop   -> new_str
06031  *
06032  *  Returns a new <code>String</code> with the last character removed.  If the
06033  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06034  *  <code>chop</code> to an empty string returns an empty
06035  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06036  *  the string unchanged if it doesn't end in a record separator.
06037  *
06038  *     "string\r\n".chop   #=> "string"
06039  *     "string\n\r".chop   #=> "string\n"
06040  *     "string\n".chop     #=> "string"
06041  *     "string".chop       #=> "strin"
06042  *     "x".chop.chop       #=> ""
06043  */
06044 
06045 static VALUE
06046 rb_str_chop(VALUE str)
06047 {
06048     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06049     rb_enc_cr_str_copy_for_substr(str2, str);
06050     OBJ_INFECT(str2, str);
06051     return str2;
06052 }
06053 
06054 
06055 /*
06056  *  call-seq:
06057  *     str.chomp!(separator=$/)   -> str or nil
06058  *
06059  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06060  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06061  */
06062 
06063 static VALUE
06064 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06065 {
          /*
           * Removes a trailing record separator from str in place.  Returns str
           * when something was removed and nil otherwise.  The separator is the
           * optional argument, or the global rb_rs when none is given.
           */
06066     rb_encoding *enc;
06067     VALUE rs;
06068     int newline;
06069     char *p, *pp, *e;
06070     long len, rslen;
06071 
06072     str_modify_keep_cr(str);
06073     len = RSTRING_LEN(str);
06074     if (len == 0) return Qnil;
06075     p = RSTRING_PTR(str);
06076     e = p + len;
06077     if (argc == 0) {
06078         rs = rb_rs;
06079         if (rs == rb_default_rs) {
                  /*
                   * "Smart" chomp: strips one trailing "\n", "\r\n" or "\r".
                   * Also entered by goto from below when rs is exactly "\n".
                   */
06080           smart_chomp:
06081             enc = rb_enc_get(str);
                  /* encodings whose minimum character length exceeds one byte */
06082             if (rb_enc_mbminlen(enc) > 1) {
                      /* drop a trailing newline character, if there is one */
06083                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06084                 if (rb_enc_is_newline(pp, e, enc)) {
06085                     e = pp;
06086                 }
                      /* then drop a '\r' sitting just before the (new) end */
06087                 pp = e - rb_enc_mbminlen(enc);
06088                 if (pp >= p) {
06089                     pp = rb_enc_left_char_head(p, pp, e, enc);
06090                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06091                         e = pp;
06092                     }
06093                 }
                      /* e did not move: nothing was removed */
06094                 if (e == RSTRING_END(str)) {
06095                     return Qnil;
06096                 }
06097                 len = e - RSTRING_PTR(str);
06098                 STR_SET_LEN(str, len);
06099             }
06100             else {
                      /* byte-wise check of the last one or two bytes */
06101                 if (RSTRING_PTR(str)[len-1] == '\n') {
06102                     STR_DEC_LEN(str);
06103                     if (RSTRING_LEN(str) > 0 &&
06104                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06105                         STR_DEC_LEN(str);
06106                     }
06107                 }
06108                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06109                     STR_DEC_LEN(str);
06110                 }
06111                 else {
06112                     return Qnil;
06113                 }
06114             }
                  /* re-terminate at the shortened length */
06115             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06116             return str;
06117         }
06118     }
06119     else {
06120         rb_scan_args(argc, argv, "01", &rs);
06121     }
          /* a nil separator never removes anything */
06122     if (NIL_P(rs)) return Qnil;
06123     StringValue(rs);
06124     rslen = RSTRING_LEN(rs);
          /* empty separator: strip every trailing "\n" / "\r\n", byte-wise */
06125     if (rslen == 0) {
06126         while (len>0 && p[len-1] == '\n') {
06127             len--;
06128             if (len>0 && p[len-1] == '\r')
06129                 len--;
06130         }
06131         if (len < RSTRING_LEN(str)) {
06132             STR_SET_LEN(str, len);
06133             RSTRING_PTR(str)[len] = '\0';
06134             return str;
06135         }
06136         return Qnil;
06137     }
          /* a separator longer than the string cannot be its suffix */
06138     if (rslen > len) return Qnil;
          /* newline holds the last byte of rs; a plain "\n" gets the smart treatment */
06139     newline = RSTRING_PTR(rs)[rslen-1];
06140     if (rslen == 1 && newline == '\n')
06141         goto smart_chomp;
06142 
06143     enc = rb_enc_check(str, rs);
06144     if (is_broken_string(rs)) {
06145         return Qnil;
06146     }
          /* pp is where rs would have to start for str to end with it */
06147     pp = e - rslen;
          /* cheap last-byte test first, full memcmp only for multi-byte rs */
06148     if (p[len-1] == newline &&
06149         (rslen <= 1 ||
06150          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
              /* reject a byte match that does not begin on a character boundary */
06151         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06152             return Qnil;
06153         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06154             ENC_CODERANGE_CLEAR(str);
06155         }
06156         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06157         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06158         return str;
06159     }
06160     return Qnil;
06161 }
06162 
06163 
06164 /*
06165  *  call-seq:
06166  *     str.chomp(separator=$/)   -> new_str
06167  *
06168  *  Returns a new <code>String</code> with the given record separator removed
06169  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06170  *  changed from the default Ruby record separator, then <code>chomp</code> also
06171  *  removes carriage return characters (that is it will remove <code>\n</code>,
06172  *  <code>\r</code>, and <code>\r\n</code>).
06173  *
06174  *     "hello".chomp            #=> "hello"
06175  *     "hello\n".chomp          #=> "hello"
06176  *     "hello\r\n".chomp        #=> "hello"
06177  *     "hello\n\r".chomp        #=> "hello\n"
06178  *     "hello\r".chomp          #=> "hello"
06179  *     "hello \n there".chomp   #=> "hello \n there"
06180  *     "hello".chomp("llo")     #=> "he"
06181  */
06182 
06183 static VALUE
06184 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06185 {
06186     str = rb_str_dup(str);
06187     rb_str_chomp_bang(argc, argv, str);
06188     return str;
06189 }
06190 
06191 /*
06192  *  call-seq:
06193  *     str.lstrip!   -> self or nil
06194  *
06195  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06196  *  change was made. See also <code>String#rstrip!</code> and
06197  *  <code>String#strip!</code>.
06198  *
06199  *     "  hello  ".lstrip!  #=> "hello  "
06200  *     "hello".lstrip!      #=> nil
06201  */
06202 
06203 static VALUE
06204 rb_str_lstrip_bang(VALUE str)
06205 {
06206     rb_encoding *enc;
06207     char *s, *t, *e;
06208 
06209     str_modify_keep_cr(str);
06210     enc = STR_ENC_GET(str);
06211     s = RSTRING_PTR(str);
06212     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06213     e = t = RSTRING_END(str);
06214     /* remove spaces at head */
06215     while (s < e) {
06216         int n;
06217         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06218 
06219         if (!rb_isspace(cc)) break;
06220         s += n;
06221     }
06222 
06223     if (s > RSTRING_PTR(str)) {
06224         STR_SET_LEN(str, t-s);
06225         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06226         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06227         return str;
06228     }
06229     return Qnil;
06230 }
06231 
06232 
06233 /*
06234  *  call-seq:
06235  *     str.lstrip   -> new_str
06236  *
06237  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06238  *  <code>String#rstrip</code> and <code>String#strip</code>.
06239  *
06240  *     "  hello  ".lstrip   #=> "hello  "
06241  *     "hello".lstrip       #=> "hello"
06242  */
06243 
06244 static VALUE
06245 rb_str_lstrip(VALUE str)
06246 {
06247     str = rb_str_dup(str);
06248     rb_str_lstrip_bang(str);
06249     return str;
06250 }
06251 
06252 
06253 /*
06254  *  call-seq:
06255  *     str.rstrip!   -> self or nil
06256  *
06257  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06258  *  no change was made. See also <code>String#lstrip!</code> and
06259  *  <code>String#strip!</code>.
06260  *
06261  *     "  hello  ".rstrip!  #=> "  hello"
06262  *     "hello".rstrip!      #=> nil
06263  */
06264 
06265 static VALUE
06266 rb_str_rstrip_bang(VALUE str)
06267 {
06268     rb_encoding *enc;
06269     char *s, *t, *e;
06270 
06271     str_modify_keep_cr(str);
06272     enc = STR_ENC_GET(str);
06273     rb_str_check_dummy_enc(enc);
06274     s = RSTRING_PTR(str);
06275     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06276     t = e = RSTRING_END(str);
06277 
06278     /* remove trailing spaces or '\0's */
06279     if (single_byte_optimizable(str)) {
06280         unsigned char c;
06281         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06282     }
06283     else {
06284         char *tp;
06285 
06286         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06287             unsigned int c = rb_enc_codepoint(tp, e, enc);
06288             if (c && !rb_isspace(c)) break;
06289             t = tp;
06290         }
06291     }
06292     if (t < e) {
06293         long len = t-RSTRING_PTR(str);
06294 
06295         STR_SET_LEN(str, len);
06296         RSTRING_PTR(str)[len] = '\0';
06297         return str;
06298     }
06299     return Qnil;
06300 }
06301 
06302 
06303 /*
06304  *  call-seq:
06305  *     str.rstrip   -> new_str
06306  *
06307  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06308  *  <code>String#lstrip</code> and <code>String#strip</code>.
06309  *
06310  *     "  hello  ".rstrip   #=> "  hello"
06311  *     "hello".rstrip       #=> "hello"
06312  */
06313 
06314 static VALUE
06315 rb_str_rstrip(VALUE str)
06316 {
06317     str = rb_str_dup(str);
06318     rb_str_rstrip_bang(str);
06319     return str;
06320 }
06321 
06322 
06323 /*
06324  *  call-seq:
06325  *     str.strip!   -> str or nil
06326  *
06327  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06328  *  <code>nil</code> if <i>str</i> was not altered.
06329  */
06330 
06331 static VALUE
06332 rb_str_strip_bang(VALUE str)
06333 {
06334     VALUE l = rb_str_lstrip_bang(str);
06335     VALUE r = rb_str_rstrip_bang(str);
06336 
06337     if (NIL_P(l) && NIL_P(r)) return Qnil;
06338     return str;
06339 }
06340 
06341 
06342 /*
06343  *  call-seq:
06344  *     str.strip   -> new_str
06345  *
06346  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06347  *
06348  *     "    hello    ".strip   #=> "hello"
06349  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06350  */
06351 
06352 static VALUE
06353 rb_str_strip(VALUE str)
06354 {
06355     str = rb_str_dup(str);
06356     rb_str_strip_bang(str);
06357     return str;
06358 }
06359 
06360 static VALUE
06361 scan_once(VALUE str, VALUE pat, long *start)
06362 {
06363     VALUE result, match;
06364     struct re_registers *regs;
06365     int i;
06366 
06367     if (rb_reg_search(pat, str, *start, 0) >= 0) {
06368         match = rb_backref_get();
06369         regs = RMATCH_REGS(match);
06370         if (BEG(0) == END(0)) {
06371             rb_encoding *enc = STR_ENC_GET(str);
06372             /*
06373              * Always consume at least one character of the input string
06374              */
06375             if (RSTRING_LEN(str) > END(0))
06376                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06377                                                    RSTRING_END(str), enc);
06378             else
06379                 *start = END(0)+1;
06380         }
06381         else {
06382             *start = END(0);
06383         }
06384         if (regs->num_regs == 1) {
06385             return rb_reg_nth_match(0, match);
06386         }
06387         result = rb_ary_new2(regs->num_regs);
06388         for (i=1; i < regs->num_regs; i++) {
06389             rb_ary_push(result, rb_reg_nth_match(i, match));
06390         }
06391 
06392         return result;
06393     }
06394     return Qnil;
06395 }
06396 
06397 
06398 /*
06399  *  call-seq:
06400  *     str.scan(pattern)                         -> array
06401  *     str.scan(pattern) {|match, ...| block }   -> str
06402  *
06403  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
06404  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
06405  *  generated and either added to the result array or passed to the block. If
06406  *  the pattern contains no groups, each individual result consists of the
06407  *  matched string, <code>$&</code>.  If the pattern contains groups, each
06408  *  individual result is itself an array containing one entry per group.
06409  *
06410  *     a = "cruel world"
06411  *     a.scan(/\w+/)        #=> ["cruel", "world"]
06412  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
06413  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
06414  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
06415  *
06416  *  And the block form:
06417  *
06418  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
06419  *     print "\n"
06420  *     a.scan(/(.)(.)/) {|x,y| print y, x }
06421  *     print "\n"
06422  *
06423  *  <em>produces:</em>
06424  *
06425  *     <<cruel>> <<world>>
06426  *     rceu lowlr
06427  */
06428 
06429 static VALUE
06430 rb_str_scan(VALUE str, VALUE pat)
06431 {
06432     VALUE result;
06433     long start = 0;
06434     long last = -1, prev = 0;
06435     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06436 
06437     pat = get_pat(pat, 1);
06438     if (!rb_block_given_p()) {
06439         VALUE ary = rb_ary_new();
06440 
06441         while (!NIL_P(result = scan_once(str, pat, &start))) {
06442             last = prev;
06443             prev = start;
06444             rb_ary_push(ary, result);
06445         }
06446         if (last >= 0) rb_reg_search(pat, str, last, 0);
06447         return ary;
06448     }
06449 
06450     while (!NIL_P(result = scan_once(str, pat, &start))) {
06451         last = prev;
06452         prev = start;
06453         rb_yield(result);
06454         str_mod_check(str, p, len);
06455     }
06456     if (last >= 0) rb_reg_search(pat, str, last, 0);
06457     return str;
06458 }
06459 
06460 
06461 /*
06462  *  call-seq:
06463  *     str.hex   -> integer
06464  *
06465  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
06466  *  (with an optional sign and an optional <code>0x</code>) and returns the
06467  *  corresponding number. Zero is returned on error.
06468  *
06469  *     "0x0a".hex     #=> 10
06470  *     "-1234".hex    #=> -4660
06471  *     "0".hex        #=> 0
06472  *     "wombat".hex   #=> 0
06473  */
06474 
06475 static VALUE
06476 rb_str_hex(VALUE str)
06477 {
06478     rb_encoding *enc = rb_enc_get(str);
06479 
06480     if (!rb_enc_asciicompat(enc)) {
06481         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06482     }
06483     return rb_str_to_inum(str, 16, FALSE);
06484 }
06485 
06486 
06487 /*
06488  *  call-seq:
06489  *     str.oct   -> integer
06490  *
06491  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
06492  *  optional sign) and returns the corresponding number.  Returns 0 if the
06493  *  conversion fails.
06494  *
06495  *     "123".oct       #=> 83
06496  *     "-377".oct      #=> -255
06497  *     "bad".oct       #=> 0
06498  *     "0377bad".oct   #=> 255
06499  */
06500 
06501 static VALUE
06502 rb_str_oct(VALUE str)
06503 {
06504     rb_encoding *enc = rb_enc_get(str);
06505 
06506     if (!rb_enc_asciicompat(enc)) {
06507         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06508     }
06509     return rb_str_to_inum(str, -8, FALSE);
06510 }
06511 
06512 
06513 /*
06514  *  call-seq:
06515  *     str.crypt(other_str)   -> new_str
06516  *
06517  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
06518  *  library function <code>crypt</code>. The argument is the salt string, which
06519  *  should be two characters long, each character drawn from
06520  *  <code>[a-zA-Z0-9./]</code>.
06521  */
06522 
06523 static VALUE
06524 rb_str_crypt(VALUE str, VALUE salt)
06525 {
06526     extern char *crypt(const char *, const char *);
06527     VALUE result;
06528     const char *s, *saltp;
06529 #ifdef BROKEN_CRYPT
06530     char salt_8bit_clean[3];
06531 #endif
06532 
06533     StringValue(salt);
06534     if (RSTRING_LEN(salt) < 2)
06535         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06536 
06537     s = RSTRING_PTR(str);
06538     if (!s) s = "";
06539     saltp = RSTRING_PTR(salt);
06540 #ifdef BROKEN_CRYPT
06541     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06542         salt_8bit_clean[0] = saltp[0] & 0x7f;
06543         salt_8bit_clean[1] = saltp[1] & 0x7f;
06544         salt_8bit_clean[2] = '\0';
06545         saltp = salt_8bit_clean;
06546     }
06547 #endif
06548     result = rb_str_new2(crypt(s, saltp));
06549     OBJ_INFECT(result, str);
06550     OBJ_INFECT(result, salt);
06551     return result;
06552 }
06553 
06554 
06555 /*
06556  *  call-seq:
06557  *     str.intern   -> symbol
06558  *     str.to_sym   -> symbol
06559  *
06560  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
06561  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
06562  *
06563  *     "Koala".intern         #=> :Koala
06564  *     s = 'cat'.to_sym       #=> :cat
06565  *     s == :cat              #=> true
06566  *     s = '@cat'.to_sym      #=> :@cat
06567  *     s == :@cat             #=> true
06568  *
06569  *  This can also be used to create symbols that cannot be represented using the
06570  *  <code>:xxx</code> notation.
06571  *
06572  *     'cat and dog'.to_sym   #=> :"cat and dog"
06573  */
06574 
06575 VALUE
06576 rb_str_intern(VALUE s)
06577 {
06578     VALUE str = RB_GC_GUARD(s);
06579     ID id;
06580 
06581     id = rb_intern_str(str);
06582     return ID2SYM(id);
06583 }
06584 
06585 
06586 /*
06587  *  call-seq:
06588  *     str.ord   -> integer
06589  *
06590  *  Return the <code>Integer</code> ordinal of a one-character string.
06591  *
06592  *     "a".ord         #=> 97
06593  */
06594 
06595 VALUE
06596 rb_str_ord(VALUE s)
06597 {
06598     unsigned int c;
06599 
06600     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06601     return UINT2NUM(c);
06602 }
06603 /*
06604  *  call-seq:
06605  *     str.sum(n=16)   -> integer
06606  *
06607  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
06608  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
06609  *  to 16. The result is simply the sum of the binary value of each byte in
06610  *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
06611  *  checksum.
06612  */
06613 
06614 static VALUE
06615 rb_str_sum(int argc, VALUE *argv, VALUE str)
06616 {
06617     VALUE vbits;
06618     int bits;
06619     char *ptr, *p, *pend;
06620     long len;
06621     VALUE sum = INT2FIX(0);
06622     unsigned long sum0 = 0;
06623 
06624     if (argc == 0) {
06625         bits = 16;
06626     }
06627     else {
06628         rb_scan_args(argc, argv, "01", &vbits);
06629         bits = NUM2INT(vbits);
06630     }
06631     ptr = p = RSTRING_PTR(str);
06632     len = RSTRING_LEN(str);
06633     pend = p + len;
06634 
06635     while (p < pend) {
06636         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06637             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06638             str_mod_check(str, ptr, len);
06639             sum0 = 0;
06640         }
06641         sum0 += (unsigned char)*p;
06642         p++;
06643     }
06644 
06645     if (bits == 0) {
06646         if (sum0) {
06647             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06648         }
06649     }
06650     else {
06651         if (sum == INT2FIX(0)) {
06652             if (bits < (int)sizeof(long)*CHAR_BIT) {
06653                 sum0 &= (((unsigned long)1)<<bits)-1;
06654             }
06655             sum = LONG2FIX(sum0);
06656         }
06657         else {
06658             VALUE mod;
06659 
06660             if (sum0) {
06661                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06662             }
06663 
06664             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06665             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06666             sum = rb_funcall(sum, '&', 1, mod);
06667         }
06668     }
06669     return sum;
06670 }
06671 
06672 static VALUE
06673 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06674 {
          /*
           * Pads str up to the requested width and returns the padded string.
           * jflag selects where the padding goes: 'l' puts all of it on the
           * right, 'r' all of it on the left, any other value splits it with
           * the left side getting n/2 (rounded down) and the right the rest.
           * argv holds the width and an optional pad string (default " ").
           */
06675     rb_encoding *enc;
06676     VALUE w;
06677     long width, len, flen = 1, fclen = 1;
06678     VALUE res;
06679     char *p;
06680     const char *f = " ";
06681     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06682     volatile VALUE pad;
06683     int singlebyte = 1, cr;
06684 
06685     rb_scan_args(argc, argv, "11", &w, &pad);
06686     enc = STR_ENC_GET(str);
06687     width = NUM2LONG(w);
06688     if (argc == 2) {
06689         StringValue(pad);
06690         enc = rb_enc_check(str, pad);
              /*
               * f/flen: pad buffer and its byte length.
               * fclen: result of str_strlen(pad, enc) -- presumably the length
               * in characters; confirm against str_strlen.
               */
06691         f = RSTRING_PTR(pad);
06692         flen = RSTRING_LEN(pad);
06693         fclen = str_strlen(pad, enc);
06694         singlebyte = single_byte_optimizable(pad);
06695         if (flen == 0 || fclen == 0) {
06696             rb_raise(rb_eArgError, "zero width padding");
06697         }
06698     }
06699     len = str_strlen(str, enc);
          /* already wide enough (or negative width): just return a copy */
06700     if (width < 0 || len >= width) return rb_str_dup(str);
          /* n units of padding in total, llen on the left and rlen on the right */
06701     n = width - len;
06702     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06703     rlen = n - llen;
06704     cr = ENC_CODERANGE(str);
          /*
           * For a multi-byte pad, llen2/rlen2 are the sizes of the trailing
           * partial copy of the pad on each side, as computed by str_offset()
           * -- presumably a byte offset; confirm against str_offset.
           */
06705     if (flen > 1) {
06706        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06707        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06708     }
06709     size = RSTRING_LEN(str);
          /*
           * Compute the padding size step by step, checking each step against
           * LONG_MAX: whole pad copies * flen, plus the two partial copies,
           * leaving room for the size bytes of str itself.
           */
06710     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06711        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06712        (len += llen2 + rlen2) >= LONG_MAX - size) {
06713        rb_raise(rb_eArgError, "argument too big");
06714     }
          /* len is now the total byte size of the result */
06715     len += size;
06716     res = rb_str_new5(str, 0, len);
06717     p = RSTRING_PTR(res);
          /* left padding: memset for a one-byte pad, otherwise whole copies plus a partial one */
06718     if (flen <= 1) {
06719        memset(p, *f, llen);
06720        p += llen;
06721     }
06722     else {
06723        while (llen >= fclen) {
06724             memcpy(p,f,flen);
06725             p += flen;
06726             llen -= fclen;
06727         }
06728        if (llen > 0) {
06729            memcpy(p, f, llen2);
06730            p += llen2;
06731         }
06732     }
          /* the original contents go in the middle */
06733     memcpy(p, RSTRING_PTR(str), size);
06734     p += size;
          /* right padding, built the same way as the left */
06735     if (flen <= 1) {
06736        memset(p, *f, rlen);
06737        p += rlen;
06738     }
06739     else {
06740        while (rlen >= fclen) {
06741             memcpy(p,f,flen);
06742             p += flen;
06743             rlen -= fclen;
06744         }
06745        if (rlen > 0) {
06746            memcpy(p, f, rlen2);
06747            p += rlen2;
06748         }
06749     }
          /* terminate and record the number of bytes actually written */
06750     *p = '\0';
06751     STR_SET_LEN(res, p-RSTRING_PTR(res));
06752     OBJ_INFECT(res, str);
06753     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06754     rb_enc_associate(res, enc);
          /* combine the code ranges of str and pad; a broken result is left unset */
06755     if (argc == 2)
06756         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06757     if (cr != ENC_CODERANGE_BROKEN)
06758         ENC_CODERANGE_SET(res, cr);
06759     return res;
06760 }
06761 
06762 
06763 /*
06764  *  call-seq:
06765  *     str.ljust(integer, padstr=' ')   -> new_str
06766  *
06767  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06768  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
06769  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06770  *
06771  *     "hello".ljust(4)            #=> "hello"
06772  *     "hello".ljust(20)           #=> "hello               "
06773  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
06774  */
06775 
06776 static VALUE
06777 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06778 {
06779     return rb_str_justify(argc, argv, str, 'l');
06780 }
06781 
06782 
06783 /*
06784  *  call-seq:
06785  *     str.rjust(integer, padstr=' ')   -> new_str
06786  *
06787  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06788  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
06789  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06790  *
06791  *     "hello".rjust(4)            #=> "hello"
06792  *     "hello".rjust(20)           #=> "               hello"
06793  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
06794  */
06795 
06796 static VALUE
06797 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06798 {
06799     return rb_str_justify(argc, argv, str, 'r'); /* 'r' asks the shared helper for right justification */
06800 }
06801 
06802 
06803 /*
06804  *  call-seq:
06805  *     str.center(integer, padstr=' ')   -> new_str
06806  *
06807  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06808  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
06809  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06810  *
06811  *     "hello".center(4)         #=> "hello"
06812  *     "hello".center(20)        #=> "       hello        "
06813  *     "hello".center(20, '123') #=> "1231231hello12312312"
06814  */
06815 
06816 static VALUE
06817 rb_str_center(int argc, VALUE *argv, VALUE str)
06818 {
06819     return rb_str_justify(argc, argv, str, 'c'); /* 'c' asks the shared helper for centering */
06820 }
06821 
06822 /*
06823  *  call-seq:
06824  *     str.partition(sep)              -> [head, sep, tail]
06825  *     str.partition(regexp)           -> [head, match, tail]
06826  *
06827  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
06828  *  and returns the part before it, the match, and the part
06829  *  after it.
06830  *  If it is not found, returns <i>str</i> and two empty strings.
06831  *
06832  *     "hello".partition("l")         #=> ["he", "l", "lo"]
06833  *     "hello".partition("x")         #=> ["hello", "", ""]
06834  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
06835  */
06836 
06837 static VALUE
06838 rb_str_partition(VALUE str, VALUE sep)
06839 {
06840     long pos;
06841     int regex = FALSE;
06842 
06843     if (TYPE(sep) == T_REGEXP) {
06844         pos = rb_reg_search(sep, str, 0, 0); /* forward search starting at offset 0 */
06845         regex = TRUE;
06846     }
06847     else {
06848         VALUE tmp;
06849 
06850         tmp = rb_check_string_type(sep); /* nil here means sep is not usable as a String */
06851         if (NIL_P(tmp)) {
06852             rb_raise(rb_eTypeError, "type mismatch: %s given",
06853                      rb_obj_classname(sep));
06854         }
06855         sep = tmp;
06856         pos = rb_str_index(str, sep, 0);
06857     }
06858     if (pos < 0) {
06859       failed: /* also entered by goto when a regexp matches the empty string at the start */
06860         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06861     }
06862     if (regex) {
06863         sep = rb_str_subpat(str, sep, INT2FIX(0)); /* from here sep is the matched text (group 0) */
06864         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; /* empty match at 0 counts as "not found" */
06865     }
06866     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06867                           sep,
06868                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
06869                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06870 }
06871 
06872 /*
06873  *  call-seq:
06874  *     str.rpartition(sep)             -> [head, sep, tail]
06875  *     str.rpartition(regexp)          -> [head, match, tail]
06876  *
06877  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
06878  *  of the string, and returns the part before it, the match, and the part
06879  *  after it.
06880  *  If it is not found, returns two empty strings and <i>str</i>.
06881  *
06882  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
06883  *     "hello".rpartition("x")         #=> ["", "", "hello"]
06884  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
06885  */
06886 
06887 static VALUE
06888 rb_str_rpartition(VALUE str, VALUE sep)
06889 {
06890     long pos = RSTRING_LEN(str);
06891     int regex = FALSE;
06892 
06893     if (TYPE(sep) == T_REGEXP) {
06894         pos = rb_reg_search(sep, str, pos, 1); /* backward search from the end of str */
06895         regex = TRUE;
06896     }
06897     else {
06898         VALUE tmp = rb_check_string_type(sep);
06899         if (NIL_P(tmp)) {
06900             rb_raise(rb_eTypeError, "type mismatch: %s given",
06901                      rb_obj_classname(sep));
06902         }
06903         sep = tmp;
06904         pos = rb_str_sublen(str, pos); /* byte length -> character count */
06905         pos = rb_str_rindex(str, sep, pos);
06906     }
06907     if (pos < 0) {
06908         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06909     }
06910     if (regex) {
06911         sep = rb_reg_nth_match(0, rb_backref_get());
06912         /* the regexp search yields a byte offset, but the slicing below counts
06913          * characters; convert so multibyte receivers are split correctly */
06914         pos = rb_str_sublen(str, pos);
06915     }
06916     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06917                           sep,
06918                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06919 }
06919 
06920 /*
06921  *  call-seq:
06922  *     str.start_with?([prefix]+)   -> true or false
06923  *
06924  *  Returns true if <i>str</i> starts with one of the prefixes given.
06925  *
06926  *    p "hello".start_with?("hell")               #=> true
06927  *
06928  *    # returns true if one of the prefixes matches.
06929  *    p "hello".start_with?("heaven", "hell")     #=> true
06930  *    p "hello".start_with?("heaven", "paradise") #=> false
06931  *
06932  *  Arguments that cannot be converted to a String are skipped.
06933  *
06934  */
06935 
06936 static VALUE
06937 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06938 {
06939     int idx = 0;
06940 
06941     while (idx < argc) {
06942         VALUE prefix = rb_check_string_type(argv[idx++]);
06943         if (!NIL_P(prefix)) {
06944             rb_enc_check(str, prefix);
06945             if (RSTRING_LEN(prefix) <= RSTRING_LEN(str) &&
06946                 memcmp(RSTRING_PTR(str), RSTRING_PTR(prefix), RSTRING_LEN(prefix)) == 0)
06947                 return Qtrue;
06948         }
06949     }
06950     return Qfalse;
06951 }
06951 
06952 /*
06953  *  call-seq:
06954  *     str.end_with?([suffix]+)   -> true or false
06955  *
06956  *  Returns true if <i>str</i> ends with one of the suffixes given.
06957  */
06958 
06959 static VALUE
06960 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06961 {
06962     int idx;
06963 
06964     for (idx = 0; idx < argc; idx++) {
06965         VALUE suffix = rb_check_string_type(argv[idx]);
06966         rb_encoding *enc;
06967         char *head, *tail, *start;
06968 
06969         if (NIL_P(suffix)) continue;
06970         enc = rb_enc_check(str, suffix);
06971         if (RSTRING_LEN(suffix) > RSTRING_LEN(str)) continue;
06972         head = RSTRING_PTR(str);
06973         tail = head + RSTRING_LEN(str);
06974         start = tail - RSTRING_LEN(suffix);  /* must coincide with a character head of str */
06975         if (rb_enc_left_char_head(head, start, tail, enc) == start &&
06976             memcmp(start, RSTRING_PTR(suffix), RSTRING_LEN(suffix)) == 0)
06977             return Qtrue;
06978     }
06979     return Qfalse;
06980 }
06981 
06982 void
06983 rb_str_setter(VALUE val, ID id, VALUE *var)
06984 {
06985     /* only nil or a String may be stored in *var; anything else raises TypeError */
06986     const int acceptable = NIL_P(val) || TYPE(val) == T_STRING;
06987     if (!acceptable)
06988         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06989     *var = val;
06990 }
06990 
06991 
06992 /*
06993  *  call-seq:
06994  *     str.force_encoding(encoding)   -> str
06995  *
06996  *  Changes the encoding to +encoding+ and returns self.
06997  */
06998 
06999 static VALUE
07000 rb_str_force_encoding(VALUE str, VALUE enc)
07001 {
07002     str_modifiable(str); /* NOTE(review): presumably raises when str must not be modified -- confirm */
07003     rb_enc_associate(str, rb_to_encoding(enc));
07004     ENC_CODERANGE_CLEAR(str); /* the code range recorded for the old encoding is dropped */
07005     return str;
07006 }
07007 
07008 /*
07009  *  call-seq:
07010  *     str.valid_encoding?  -> true or false
07011  *
07012  *  Returns true for a string which is encoded correctly.
07013  *
07014  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07015  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07016  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07017  */
07018 
07019 static VALUE
07020 rb_str_valid_encoding_p(VALUE str)
07021 {
07022     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
07023         return Qfalse;
07024     return Qtrue;
07025 }
07026 
07027 /*
07028  *  call-seq:
07029  *     str.ascii_only?  -> true or false
07030  *
07031  *  Returns true for a string which has only ASCII characters.
07032  *
07033  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07034  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07035  */
07036 
07037 static VALUE
07038 rb_str_is_ascii_only_p(VALUE str)
07039 {
07040     if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
07041         return Qfalse;
07042     return Qtrue;
07043 }
07044 
07045 /**********************************************************************
07046  * Document-class: Symbol
07047  *
07048  *  <code>Symbol</code> objects represent names and some strings
07049  *  inside the Ruby
07050  *  interpreter. They are generated using the <code>:name</code> and
07051  *  <code>:"string"</code> literal
07052  *  syntax, and by the various <code>to_sym</code> methods. The same
07053  *  <code>Symbol</code> object will be created for a given name or string
07054  *  for the duration of a program's execution, regardless of the context
07055  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07056  *  one context, a method in another, and a class in a third, the
07057  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07058  *  all three contexts.
07059  *
07060  *     module One
07061  *       class Fred
07062  *       end
07063  *       $f1 = :Fred
07064  *     end
07065  *     module Two
07066  *       Fred = 1
07067  *       $f2 = :Fred
07068  *     end
07069  *     def Fred()
07070  *     end
07071  *     $f3 = :Fred
07072  *     $f1.object_id   #=> 2514190
07073  *     $f2.object_id   #=> 2514190
07074  *     $f3.object_id   #=> 2514190
07075  *
07076  */
07077 
07078 
07079 /*
07080  *  call-seq:
07081  *     sym == obj   -> true or false
07082  *
07083  *  Equality---Returns <code>true</code> only when <i>sym</i> and <i>obj</i>
07084  *  are exactly the same symbol.
07085  */
07086 
07087 static VALUE
07088 sym_equal(VALUE sym1, VALUE sym2)
07089 {
07090     return (sym1 == sym2) ? Qtrue : Qfalse;
07091 }
07093 
07094 
07095 static int
07096 sym_printable(const char *s, const char *send, rb_encoding *enc)
07097 {
07098     /* TRUE only if every character in [s, send) is printable in enc */
07099     int width;
07100 
07101     for (; s < send; s += width) {
07102         int code = rb_enc_codepoint_len(s, send, &width, enc);
07103         if (!rb_enc_isprint(code, enc)) return FALSE;
07104     }
07105     return TRUE;
07106 }
07107 
07108 /*
07109  *  call-seq:
07110  *     sym.inspect    -> string
07111  *
07112  *  Returns the representation of <i>sym</i> as a symbol literal.
07113  *
07114  *     :fred.inspect   #=> ":fred"
07115  */
07116 
07117 static VALUE
07118 sym_inspect(VALUE sym)
07119 {
07120     VALUE str;
07121     ID id = SYM2ID(sym);
07122     rb_encoding *enc;
07123     const char *ptr;
07124     long len;
07125     char *dest;
07126     rb_encoding *resenc = rb_default_internal_encoding();
07127 
07128     if (resenc == NULL) resenc = rb_default_external_encoding(); /* fall back to the external default */
07129     sym = rb_id2str(id); /* from here on sym is the name String, not the Symbol */
07130     enc = STR_ENC_GET(sym);
07131     ptr = RSTRING_PTR(sym);
07132     len = RSTRING_LEN(sym);
07133     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07134         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { /* not a plain name */
07135         str = rb_str_inspect(sym); /* inspected form of the name String */
07136         len = RSTRING_LEN(str);
07137         rb_str_resize(str, len + 1);
07138         dest = RSTRING_PTR(str);
07139         memmove(dest + 1, dest, len); /* shift everything one byte right to make room for ':' */
07140         dest[0] = ':';
07141     }
07142     else {
07143         char *dest; /* shadows the function-level dest */
07144         str = rb_enc_str_new(0, len + 1, enc); /* one byte for ':' plus the name itself */
07145         dest = RSTRING_PTR(str);
07146         dest[0] = ':';
07147         memcpy(dest + 1, ptr, len);
07148     }
07149     return str;
07150 }
07151 
07152 
07153 /*
07154  *  call-seq:
07155  *     sym.id2name   -> string
07156  *     sym.to_s      -> string
07157  *
07158  *  Returns the name or string corresponding to <i>sym</i>.
07159  *
07160  *     :fred.id2name   #=> "fred"
07161  */
07162 
07163 
07164 VALUE
07165 rb_sym_to_s(VALUE sym)
07166 {
07167     ID id = SYM2ID(sym);
07168 
07169     return str_new3(rb_cString, rb_id2str(id)); /* NOTE(review): presumably a distinct String built from the stored name -- confirm str_new3 */
07170 }
07171 
07172 
07173 /*
07174  * call-seq:
07175  *   sym.to_sym   -> sym
07176  *   sym.intern   -> sym
07177  *
07178  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07179  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07180  * in this case.
07181  */
07182 
07183 static VALUE
07184 sym_to_sym(VALUE sym)
07185 {
07186     return sym; /* identity: the receiver itself is handed back */
07187 }
07188 
07189 VALUE rb_funcall_passing_block(VALUE recv, ID mid, int argc, const VALUE *argv);
07190 
07191 static VALUE
07192 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07193 {
07194     /* sym holds the method ID that sym_to_proc stored; argv[0] is the receiver */
07195     if (argc < 1) {
07196         rb_raise(rb_eArgError, "no receiver given");
07197     }
07198     /* the remaining arguments are forwarded to the receiver's method */
07199     return rb_funcall_passing_block(argv[0], (ID)sym, argc - 1, argv + 1);
07200 }
07202 
07203 /*
07204  * call-seq:
07205  *   sym.to_proc
07206  *
07207  * Returns a _Proc_ object which calls the method named by _sym_ on its first argument.
07208  *
07209  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07210  */
07211 
07212 static VALUE
07213 sym_to_proc(VALUE sym)
07214 {
07215     static VALUE sym_proc_cache = Qfalse; /* created on first use, see below */
07216     enum {SYM_PROC_CACHE_SIZE = 67}; /* number of cache slots */
07217     VALUE proc;
07218     long id, index;
07219     VALUE *aryp;
07220 
07221     if (!sym_proc_cache) {
07222         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); /* two cells per slot: symbol, proc */
07223         rb_gc_register_mark_object(sym_proc_cache); /* NOTE(review): presumably keeps the cache reachable for the GC -- confirm */
07224         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); /* touch the last cell (presumably sizes the array) */
07225     }
07226 
07227     id = SYM2ID(sym);
07228     index = (id % SYM_PROC_CACHE_SIZE) << 1; /* slot number times two = position of the symbol cell */
07229 
07230     aryp = RARRAY_PTR(sym_proc_cache);
07231     if (aryp[index] == sym) {
07232         return aryp[index + 1]; /* hit: reuse the proc stored next to the symbol */
07233     }
07234     else {
07235         proc = rb_proc_new(sym_call, (VALUE)id); /* sym_call receives the ID as its second argument */
07236         aryp[index] = sym; /* miss: overwrite whatever occupied this slot */
07237         aryp[index + 1] = proc;
07238         return proc;
07239     }
07240 }
07241 
07242 /*
07243  * call-seq:
07244  *
07245  *   sym.succ
07246  *
07247  * Same as <code>sym.to_s.succ.intern</code>.
07248  */
07249 
07250 static VALUE
07251 sym_succ(VALUE sym)
07252 {
07253     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); /* to_s, then succ, then intern */
07254 }
07255 
07256 /*
07257  * call-seq:
07258  *
07259  *   sym <=> other       -> -1, 0, +1 or nil
07260  *
07261  * Compares _sym_ with _other_ in string form; nil unless _other_ is a symbol.
07262  */
07263 
07264 static VALUE
07265 sym_cmp(VALUE sym, VALUE other)
07266 {
07267     /* symbols are only ordered against other symbols */
07268     if (SYMBOL_P(other))
07269         return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07270     return Qnil;
07271 }
07272 
07273 /*
07274  * call-seq:
07275  *
07276  *   sym.casecmp(other)  -> -1, 0, +1 or nil
07277  *
07278  * Case-insensitive version of <code>Symbol#<=></code>; nil unless _other_ is a symbol.
07279  */
07280 
07281 static VALUE
07282 sym_casecmp(VALUE sym, VALUE other)
07283 {
07284     /* anything that is not a symbol cannot be compared */
07285     if (SYMBOL_P(other))
07286         return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07287     return Qnil;
07288 }
07289 
07290 /*
07291  * call-seq:
07292  *   sym =~ obj   -> fixnum or nil
07293  *
07294  * Returns <code>sym.to_s =~ obj</code>.
07295  */
07296 
07297 static VALUE
07298 sym_match(VALUE sym, VALUE other)
07299 {
07300     return rb_str_match(rb_sym_to_s(sym), other); /* matching is done on the symbol's string form */
07301 }
07302 
07303 /*
07304  * call-seq:
07305  *   sym[idx]      -> char
07306  *   sym[b, n]     -> string
07307  *
07308  * Returns <code>sym.to_s[]</code>.
07309  */
07310 
07311 static VALUE
07312 sym_aref(int argc, VALUE *argv, VALUE sym)
07313 {
07314     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); /* all arguments go unchanged to the String version */
07315 }
07316 
07317 /*
07318  * call-seq:
07319  *   sym.length    -> integer
07320  *
07321  * Same as <code>sym.to_s.length</code>.
07322  */
07323 
07324 static VALUE
07325 sym_length(VALUE sym)
07326 {
07327     return rb_str_length(rb_id2str(SYM2ID(sym))); /* measured on the name string from rb_id2str */
07328 }
07329 
07330 /*
07331  * call-seq:
07332  *   sym.empty?   -> true or false
07333  *
07334  * Returns whether _sym_ is :"" or not.
07335  */
07336 
07337 static VALUE
07338 sym_empty(VALUE sym)
07339 {
07340     return rb_str_empty(rb_id2str(SYM2ID(sym))); /* tested on the name string from rb_id2str */
07341 }
07342 
07343 /*
07344  * call-seq:
07345  *   sym.upcase    -> symbol
07346  *
07347  * Same as <code>sym.to_s.upcase.intern</code>.
07348  */
07349 
07350 static VALUE
07351 sym_upcase(VALUE sym)
07352 {
07353     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); /* name string -> upcase -> intern */
07354 }
07355 
07356 /*
07357  * call-seq:
07358  *   sym.downcase  -> symbol
07359  *
07360  * Same as <code>sym.to_s.downcase.intern</code>.
07361  */
07362 
07363 static VALUE
07364 sym_downcase(VALUE sym)
07365 {
07366     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); /* name string -> downcase -> intern */
07367 }
07368 
07369 /*
07370  * call-seq:
07371  *   sym.capitalize  -> symbol
07372  *
07373  * Same as <code>sym.to_s.capitalize.intern</code>.
07374  */
07375 
07376 static VALUE
07377 sym_capitalize(VALUE sym)
07378 {
07379     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); /* name string -> capitalize -> intern */
07380 }
07381 
07382 /*
07383  * call-seq:
07384  *   sym.swapcase  -> symbol
07385  *
07386  * Same as <code>sym.to_s.swapcase.intern</code>.
07387  */
07388 
07389 static VALUE
07390 sym_swapcase(VALUE sym)
07391 {
07392     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); /* name string -> swapcase -> intern */
07393 }
07394 
07395 /*
07396  * call-seq:
07397  *   sym.encoding   -> encoding
07398  *
07399  * Returns the Encoding object that represents the encoding of _sym_.
07400  */
07401 
07402 static VALUE
07403 sym_encoding(VALUE sym)
07404 {
07405     return rb_obj_encoding(rb_id2str(SYM2ID(sym))); /* the encoding is read from the symbol's name string */
07406 }
07407 
07408 ID
07409 rb_to_id(VALUE name)
07410 {
07411     VALUE tmp;
07412 
07413     /* Reduce name to a Symbol (other object -> String -> Symbol), then take its ID. */
07414     switch (TYPE(name)) {
07415       default:
07416         tmp = rb_check_string_type(name);
07417         if (NIL_P(tmp)) {
07418             tmp = rb_inspect(name);
07419             rb_raise(rb_eTypeError, "%s is not a symbol",
07420                      RSTRING_PTR(tmp));
07421         }
07422         name = tmp;
07423         /* fall through */
07424       case T_STRING:
07425         name = rb_str_intern(name);
07426         /* fall through */
07427       case T_SYMBOL:
07428         break;
07429     }
07430     return SYM2ID(name);
07431 }
07432 
07433 /*
07434  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
07435  *  bytes, typically representing characters. String objects may be created
07436  *  using <code>String::new</code> or as literals.
07437  *
07438  *  Because of aliasing issues, users of strings should be aware of the methods
07439  *  that modify the contents of a <code>String</code> object.  Typically,
07440  *  methods with names ending in ``!'' modify their receiver, while those
07441  *  without a ``!'' return a new <code>String</code>.  However, there are
07442  *  exceptions, such as <code>String#[]=</code>.
07443  *
07444  */
07445 
07446 void
07447 Init_String(void)
07448 {
07449 #undef rb_intern
07450 #define rb_intern(str) rb_intern_const(str)
07451     /* Registers the String and Symbol classes and binds their methods to the C functions of this file. */
07452     rb_cString  = rb_define_class("String", rb_cObject);
07453     rb_include_module(rb_cString, rb_mComparable);
07454     rb_define_alloc_func(rb_cString, str_alloc);
07455     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07456     rb_define_method(rb_cString, "initialize", rb_str_init, -1); /* last argument is the arity; -1 = function takes (argc, argv, self) */
07457     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07458     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07459     rb_define_method(rb_cString, "==", rb_str_equal, 1);
07460     rb_define_method(rb_cString, "===", rb_str_equal, 1); /* same function as == */
07461     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07462     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07463     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07464     rb_define_method(rb_cString, "+", rb_str_plus, 1);
07465     rb_define_method(rb_cString, "*", rb_str_times, 1);
07466     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07467     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07468     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07469     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07470     rb_define_method(rb_cString, "length", rb_str_length, 0);
07471     rb_define_method(rb_cString, "size", rb_str_length, 0); /* same function as length */
07472     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07473     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07474     rb_define_method(rb_cString, "=~", rb_str_match, 1);
07475     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07476     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07477     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07478     rb_define_method(rb_cString, "next", rb_str_succ, 0); /* same function as succ */
07479     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0); /* same function as succ! */
07480     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07481     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07482     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07483     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07484     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07485     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07486     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07487     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07488 
07489     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07490     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07491     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07492     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); /* same function as to_s */
07493     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07494     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07495 
07496     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07497     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07498     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07499     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07500 
07501     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07502     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07503     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07504     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07505 
07506     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07507     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07508     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07509     rb_define_method(rb_cString, "lines", rb_str_each_line, -1); /* lines/bytes/chars/codepoints share the each_* functions */
07510     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07511     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07512     rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07513     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07514     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07515     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07516     rb_define_method(rb_cString, "<<", rb_str_concat, 1); /* same function as concat */
07517     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07518     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07519     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* same function as intern */
07520     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07521 
07522     rb_define_method(rb_cString, "include?", rb_str_include, 1);
07523     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07524     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07525 
07526     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07527 
07528     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07529     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07530     rb_define_method(rb_cString, "center", rb_str_center, -1);
07531 
07532     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07533     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07534     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07535     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07536     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07537     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07538     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07539 
07540     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07541     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07542     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07543     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07544     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07545     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07546     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07547 
07548     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07549     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07550     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07551     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07552     rb_define_method(rb_cString, "count", rb_str_count, -1);
07553 
07554     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07555     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07556     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07557     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07558 
07559     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07560     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07561     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07562     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07563 
07564     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07565 
07566     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1); /* same function as [] */
07567     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07568 
07569     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07570     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07571 
07572     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
07573     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07574     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07575     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07576 
07577     id_to_s = rb_intern("to_s");
07578 
07579     rb_fs = Qnil;
07580     rb_define_variable("$;", &rb_fs); /* $; and $-F are both backed by the C variable rb_fs */
07581     rb_define_variable("$-F", &rb_fs);
07582 
07583     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07584     rb_include_module(rb_cSymbol, rb_mComparable);
07585     rb_undef_alloc_func(rb_cSymbol); /* no allocator and no Symbol.new (next line) */
07586     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07587     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
07588 
07589     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07590     rb_define_method(rb_cSymbol, "===", sym_equal, 1); /* same function as == */
07591     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07592     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07593     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0); /* same function as to_s */
07594     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07595     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07596     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07597     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07598     rb_define_method(rb_cSymbol, "next", sym_succ, 0); /* same function as succ */
07599 
07600     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07601     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07602     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07603 
07604     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07605     rb_define_method(rb_cSymbol, "slice", sym_aref, -1); /* same function as [] */
07606     rb_define_method(rb_cSymbol, "length", sym_length, 0);
07607     rb_define_method(rb_cSymbol, "size", sym_length, 0); /* same function as length */
07608     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07609     rb_define_method(rb_cSymbol, "match", sym_match, 1); /* same function as =~ */
07610 
07611     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07612     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07613     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07614     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07615 
07616     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07617 }
07618 

Generated on Sat Jul 7 2012 15:29:24 for Ruby by  doxygen 1.7.1