00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018
/* BEG(no): element `no` of the `beg` array reached through a pointer named
 * `regs`, which must be in scope wherever the macro is expanded.
 * NOTE(review): presumably regexp match registers -- confirm at use sites. */
00019 #define BEG(no) regs->beg[no]
/* END(no): element `no` of the `end` array of the same `regs` pointer. */
00020 #define END(no) regs->end[no]
00021
00022 #include <math.h>
00023 #include <ctype.h>
00024
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028
/* Element count of a true array (not a pointer), converted to int. */
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047
/* Class objects used by this file; rb_cString is the class passed to
 * str_alloc()/str_new() for plain strings.  They are assigned outside the
 * portion of the file visible here. */
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050
00051 #define RUBY_MAX_CHAR_LEN 16
/* Per-object flag bits used by the string implementation. */
00052 #define STR_TMPLOCK FL_USER7
00053 #define STR_NOEMBED FL_USER1
00054 #define STR_SHARED FL_USER2
00055 #define STR_ASSOC FL_USER3
/* Shared: heap string whose aux slot holds the shared (frozen) string.
 * Note the test uses ELTS_SHARED rather than STR_SHARED. */
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
/* Associated: heap string whose aux slot holds an associated object. */
00057 #define STR_ASSOC_P(s) FL_ALL(s, STR_NOEMBED|STR_ASSOC)
/* "No capacity": heap strings whose aux slot is not a capacity because it is
 * used for sharing or association instead. */
00058 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Drop the shared/associated bits of a heap string; embedded strings are
 * left untouched. */
00060 #define STR_UNSET_NOCAPA(s) do {\
00061 if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
00062 } while (0)
00063
00064
/* True when the string keeps its bytes inline (STR_NOEMBED is clear). */
#define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))

/* Switch the string back to the embedded representation. */
#define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)

/* Store `n` in the embedded-length bit field of the object's flags.
 * `n` is evaluated exactly once. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n); \
    RBASIC(str)->flags = \
        (RBASIC(str)->flags & ~RSTRING_EMBED_LEN_MASK) | \
        ((tmp_n) << RSTRING_EMBED_LEN_SHIFT); \
} while (0)

/* Switch the string to the heap representation and zero the embedded-length
 * bit field, which is unused in that representation. */
#define STR_SET_NOEMBED(str) do { \
    FL_SET(str, STR_NOEMBED); \
    STR_SET_EMBED_LEN(str, 0); \
} while (0)
00076
/* Set the byte length of `str`, writing either the heap length field or the
 * embedded-length flag bits depending on the current representation. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) { \
        RSTRING(str)->as.heap.len = (n); \
    } \
    else { \
        STR_SET_EMBED_LEN(str, n); \
    } \
} while (0)
00085
/* Decrease the byte length of `str` by one, for either representation. */
#define STR_DEC_LEN(str) do { \
    if (!STR_EMBED_P(str)) { \
        RSTRING(str)->as.heap.len--; \
    } \
    else { \
        long n = RSTRING_LEN(str) - 1; \
        STR_SET_EMBED_LEN(str, n); \
    } \
} while (0)
00096
/*
 * Make room for `capacity` bytes plus a terminator in `str`.
 *
 *  - Embedded string: nothing happens unless `capacity` exceeds
 *    RSTRING_EMBED_LEN_MAX; then the current bytes are copied into a fresh
 *    heap buffer and the string is switched to the heap representation.
 *  - Heap string: the buffer is reallocated; the capacity field is only
 *    written when the aux slot really is a capacity (!STR_NOCAPA_P).
 *
 * `capacity` is expanded several times, so it must be free of side effects.
 * Every use is parenthesised so that an expression argument such as
 * `len << 1` or `a ? b : c` is sized correctly in the allocation.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            /* still reads the embedded length: STR_NOEMBED is set below */\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114
/* Code-range predicates; both go through rb_enc_str_coderange(), which scans
 * the string and caches the result when the range is still unknown. */
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117
/* Encoding object for the encoding index stored on `str`. */
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123 rb_encoding *enc;
00124
00125
00126 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127 return 1;
00128
00129 enc = STR_ENC_GET(str);
00130 if (rb_enc_mbmaxlen(enc) == 1)
00131 return 1;
00132
00133
00134
00135 return 0;
00136 }
00137
/* NOTE(review): not referenced in the visible part of this file; presumably
 * the default field separator ($;) -- confirm against its users. */
00138 VALUE rb_fs;
00139
/* Bit mask selecting the high bit of every byte in a VALUE-sized word.
 * Left undefined when VALUE is neither 4 nor 8 bytes wide. */
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif

/*
 * Return a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when every byte is ASCII.
 *
 * When NONASCII_MASK is available and the range is longer than two words,
 * the unaligned head is checked byte by byte, the aligned middle is skipped
 * a whole word at a time, and the byte loop resumes at the first suspicious
 * word (or at the aligned tail).
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE *)(((VALUE)p + align_mask) & ~align_mask);
        const VALUE *word_end = (const VALUE *)((VALUE)e & ~align_mask);

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p)) {
                return p;
            }
        }
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* either the word containing a high bit or the start of the tail */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p)) {
            return p;
        }
    }
    return NULL;
}
00176
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180 const char *e = p + len;
00181
00182 if (rb_enc_to_index(enc) == 0) {
00183
00184 p = search_nonascii(p, e);
00185 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186 }
00187
00188 if (rb_enc_asciicompat(enc)) {
00189 p = search_nonascii(p, e);
00190 if (!p) {
00191 return ENC_CODERANGE_7BIT;
00192 }
00193 while (p < e) {
00194 int ret = rb_enc_precise_mbclen(p, e, enc);
00195 if (!MBCLEN_CHARFOUND_P(ret)) {
00196 return ENC_CODERANGE_BROKEN;
00197 }
00198 p += MBCLEN_CHARFOUND_LEN(ret);
00199 if (p < e) {
00200 p = search_nonascii(p, e);
00201 if (!p) {
00202 return ENC_CODERANGE_VALID;
00203 }
00204 }
00205 }
00206 if (e < p) {
00207 return ENC_CODERANGE_BROKEN;
00208 }
00209 return ENC_CODERANGE_VALID;
00210 }
00211
00212 while (p < e) {
00213 int ret = rb_enc_precise_mbclen(p, e, enc);
00214
00215 if (!MBCLEN_CHARFOUND_P(ret)) {
00216 return ENC_CODERANGE_BROKEN;
00217 }
00218 p += MBCLEN_CHARFOUND_LEN(ret);
00219 }
00220 if (e < p) {
00221 return ENC_CODERANGE_BROKEN;
00222 }
00223 return ENC_CODERANGE_VALID;
00224 }
00225
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229 const char *p = s;
00230
00231 if (*cr == ENC_CODERANGE_BROKEN)
00232 return e - s;
00233
00234 if (rb_enc_to_index(enc) == 0) {
00235
00236 p = search_nonascii(p, e);
00237 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238 return e - s;
00239 }
00240 else if (rb_enc_asciicompat(enc)) {
00241 p = search_nonascii(p, e);
00242 if (!p) {
00243 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244 return e - s;
00245 }
00246 while (p < e) {
00247 int ret = rb_enc_precise_mbclen(p, e, enc);
00248 if (!MBCLEN_CHARFOUND_P(ret)) {
00249 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250 return p - s;
00251 }
00252 p += MBCLEN_CHARFOUND_LEN(ret);
00253 if (p < e) {
00254 p = search_nonascii(p, e);
00255 if (!p) {
00256 *cr = ENC_CODERANGE_VALID;
00257 return e - s;
00258 }
00259 }
00260 }
00261 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262 return p - s;
00263 }
00264 else {
00265 while (p < e) {
00266 int ret = rb_enc_precise_mbclen(p, e, enc);
00267 if (!MBCLEN_CHARFOUND_P(ret)) {
00268 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269 return p - s;
00270 }
00271 p += MBCLEN_CHARFOUND_LEN(ret);
00272 }
00273 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274 return p - s;
00275 }
00276 }
00277
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281 rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287
00288
00289
00290 str_enc_copy(dest, src);
00291 switch (ENC_CODERANGE(src)) {
00292 case ENC_CODERANGE_7BIT:
00293 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294 break;
00295 case ENC_CODERANGE_VALID:
00296 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299 else
00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301 break;
00302 default:
00303 if (RSTRING_LEN(dest) == 0) {
00304 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306 else
00307 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308 }
00309 break;
00310 }
00311 }
00312
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316 str_enc_copy(dest, src);
00317 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323 int cr = ENC_CODERANGE(str);
00324
00325 if (cr == ENC_CODERANGE_UNKNOWN) {
00326 rb_encoding *enc = STR_ENC_GET(str);
00327 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328 ENC_CODERANGE_SET(str, cr);
00329 }
00330 return cr;
00331 }
00332
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336 rb_encoding *enc = STR_ENC_GET(str);
00337
00338 if (!rb_enc_asciicompat(enc))
00339 return FALSE;
00340 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341 return TRUE;
00342 return FALSE;
00343 }
00344
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349 rb_raise(rb_eRuntimeError, "string modified");
00350 }
00351 }
00352
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356 if (OBJ_FROZEN(s)) {
00357 rb_raise(rb_eRuntimeError, "string frozen");
00358 }
00359 }
00360
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364 if (STR_EMBED_P(str)) {
00365 return RSTRING_EMBED_LEN_MAX;
00366 }
00367 else if (STR_NOCAPA_P(str)) {
00368 return RSTRING(str)->as.heap.len;
00369 }
00370 else {
00371 return RSTRING(str)->as.heap.aux.capa;
00372 }
00373 }
00374
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378 NEWOBJ(str, struct RString);
00379 OBJSETUP(str, klass, T_STRING);
00380
00381 str->as.heap.ptr = 0;
00382 str->as.heap.len = 0;
00383 str->as.heap.aux.capa = 0;
00384
00385 return (VALUE)str;
00386 }
00387
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391 VALUE str;
00392
00393 if (len < 0) {
00394 rb_raise(rb_eArgError, "negative string size (or size too big)");
00395 }
00396
00397 str = str_alloc(klass);
00398 if (len > RSTRING_EMBED_LEN_MAX) {
00399 RSTRING(str)->as.heap.aux.capa = len;
00400 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401 STR_SET_NOEMBED(str);
00402 }
00403 else if (len == 0) {
00404 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405 }
00406 if (ptr) {
00407 memcpy(RSTRING_PTR(str), ptr, len);
00408 }
00409 STR_SET_LEN(str, len);
00410 RSTRING_PTR(str)[len] = '\0';
00411 return str;
00412 }
00413
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417 return str_new(rb_cString, ptr, len);
00418 }
00419
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423 VALUE str = rb_str_new(ptr, len);
00424 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425 return str;
00426 }
00427
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431 VALUE str = rb_str_new(ptr, len);
00432 rb_enc_associate(str, enc);
00433 return str;
00434 }
00435
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439 if (!ptr) {
00440 rb_raise(rb_eArgError, "NULL pointer given");
00441 }
00442 return rb_str_new(ptr, strlen(ptr));
00443 }
00444
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451 VALUE str = rb_str_new2(ptr);
00452 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453 return str;
00454 }
00455
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462 VALUE str = rb_str_new(ptr, len);
00463
00464 OBJ_TAINT(str);
00465 return str;
00466 }
00467
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471 VALUE str = rb_str_new2(ptr);
00472
00473 OBJ_TAINT(str);
00474 return str;
00475 }
00476
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483 rb_econv_t *ec;
00484 rb_econv_result_t ret;
00485 long len;
00486 VALUE newstr;
00487 const unsigned char *sp;
00488 unsigned char *dp;
00489
00490 if (!to) return str;
00491 if (from == to) return str;
00492 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493 to == rb_ascii8bit_encoding()) {
00494 if (STR_ENC_GET(str) != to) {
00495 str = rb_str_dup(str);
00496 rb_enc_associate(str, to);
00497 }
00498 return str;
00499 }
00500
00501 len = RSTRING_LEN(str);
00502 newstr = rb_str_new(0, len);
00503
00504 retry:
00505 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506 if (!ec) return str;
00507
00508 sp = (unsigned char*)RSTRING_PTR(str);
00509 dp = (unsigned char*)RSTRING_PTR(newstr);
00510 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512 rb_econv_close(ec);
00513 switch (ret) {
00514 case econv_destination_buffer_full:
00515
00516 len = len < 2 ? 2 : len * 2;
00517 rb_str_resize(newstr, len);
00518 goto retry;
00519
00520 case econv_finished:
00521 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522 rb_str_set_len(newstr, len);
00523 rb_enc_associate(newstr, to);
00524 return newstr;
00525
00526 default:
00527
00528 return str;
00529 }
00530 }
00531
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541 VALUE str;
00542
00543 str = rb_tainted_str_new(ptr, len);
00544 if (eenc == rb_usascii_encoding() &&
00545 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546 rb_enc_associate(str, rb_ascii8bit_encoding());
00547 return str;
00548 }
00549 rb_enc_associate(str, eenc);
00550 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611 STR_SET_EMBED(str2);
00612 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614 }
00615 else {
00616 str = rb_str_new_frozen(str);
00617 FL_SET(str2, STR_NOEMBED);
00618 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620 RSTRING(str2)->as.heap.aux.shared = str;
00621 FL_SET(str2, ELTS_SHARED);
00622 }
00623 rb_enc_cr_str_exact_copy(str2, str);
00624
00625 return str2;
00626 }
00627
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631 return str_replace_shared(str_alloc(klass), str);
00632 }
00633
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637 return str_new_shared(klass, str);
00638 }
00639
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643 VALUE str2 = str_new3(rb_obj_class(str), str);
00644
00645 OBJ_INFECT(str2, str);
00646 return str2;
00647 }
00648
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655 VALUE str2;
00656
00657 str2 = str_alloc(klass);
00658 STR_SET_NOEMBED(str2);
00659 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661 if (STR_SHARED_P(str)) {
00662 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663 assert(OBJ_FROZEN(shared));
00664 FL_SET(str2, ELTS_SHARED);
00665 RSTRING(str2)->as.heap.aux.shared = shared;
00666 }
00667 else {
00668 FL_SET(str, ELTS_SHARED);
00669 RSTRING(str)->as.heap.aux.shared = str2;
00670 }
00671 rb_enc_cr_str_exact_copy(str2, str);
00672 OBJ_INFECT(str2, str);
00673 return str2;
00674 }
00675
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679 VALUE klass, str;
00680
00681 if (OBJ_FROZEN(orig)) return orig;
00682 klass = rb_obj_class(orig);
00683 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684 long ofs;
00685 assert(OBJ_FROZEN(str));
00686 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689 ENCODING_GET(str) != ENCODING_GET(orig)) {
00690 str = str_new3(klass, str);
00691 RSTRING(str)->as.heap.ptr += ofs;
00692 RSTRING(str)->as.heap.len -= ofs;
00693 rb_enc_cr_str_exact_copy(str, orig);
00694 OBJ_INFECT(str, orig);
00695 }
00696 }
00697 else if (STR_EMBED_P(orig)) {
00698 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699 rb_enc_cr_str_exact_copy(str, orig);
00700 OBJ_INFECT(str, orig);
00701 }
00702 else if (STR_ASSOC_P(orig)) {
00703 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704 FL_UNSET(orig, STR_ASSOC);
00705 str = str_new4(klass, orig);
00706 FL_SET(str, STR_ASSOC);
00707 RSTRING(str)->as.heap.aux.shared = assoc;
00708 }
00709 else {
00710 str = str_new4(klass, orig);
00711 }
00712 OBJ_FREEZE(str);
00713 return str;
00714 }
00715
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722 return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726 rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732 VALUE v = rb_str_new5(str, 0, 0);
00733 OBJ_INFECT(v, str);
00734 return v;
00735 }
00736
00737 #define STR_BUF_MIN_SIZE 128
00738
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742 VALUE str = str_alloc(rb_cString);
00743
00744 if (capa < STR_BUF_MIN_SIZE) {
00745 capa = STR_BUF_MIN_SIZE;
00746 }
00747 FL_SET(str, STR_NOEMBED);
00748 RSTRING(str)->as.heap.aux.capa = capa;
00749 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750 RSTRING(str)->as.heap.ptr[0] = '\0';
00751
00752 return str;
00753 }
00754
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758 VALUE str;
00759 long len = strlen(ptr);
00760
00761 str = rb_str_buf_new(len);
00762 rb_str_buf_cat(str, ptr, len);
00763
00764 return str;
00765 }
00766
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773 return str_new(0, 0, len);
00774 }
00775
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780 xfree(RSTRING(str)->as.heap.ptr);
00781 }
00782 }
00783
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788 return RSTRING(str)->as.heap.aux.capa;
00789 }
00790 else {
00791 return 0;
00792 }
00793 }
00794
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798 return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800
00801 static inline void str_discard(VALUE str);
00802
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806 rb_encoding *enc;
00807 int cr;
00808 if (str == str2) return;
00809 enc = STR_ENC_GET(str2);
00810 cr = ENC_CODERANGE(str2);
00811 str_discard(str);
00812 OBJ_INFECT(str, str2);
00813 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814 STR_SET_EMBED(str);
00815 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00816 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817 rb_enc_associate(str, enc);
00818 ENC_CODERANGE_SET(str, cr);
00819 return;
00820 }
00821 STR_SET_NOEMBED(str);
00822 STR_UNSET_NOCAPA(str);
00823 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825 if (STR_NOCAPA_P(str2)) {
00826 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00827 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828 }
00829 else {
00830 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831 }
00832 STR_SET_EMBED(str2);
00833 RSTRING_PTR(str2)[0] = 0;
00834 STR_SET_EMBED_LEN(str2, 0);
00835 rb_enc_associate(str, enc);
00836 ENC_CODERANGE_SET(str, cr);
00837 }
00838
00839 static ID id_to_s;
00840
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844 VALUE str;
00845
00846 if (TYPE(obj) == T_STRING) {
00847 return obj;
00848 }
00849 str = rb_funcall(obj, id_to_s, 0);
00850 if (TYPE(str) != T_STRING)
00851 return rb_any_to_s(obj);
00852 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853 return str;
00854 }
00855
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859 long len;
00860
00861 len = RSTRING_LEN(str2);
00862 if (STR_ASSOC_P(str2)) {
00863 str2 = rb_str_new4(str2);
00864 }
00865 if (STR_SHARED_P(str2)) {
00866 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867 assert(OBJ_FROZEN(shared));
00868 STR_SET_NOEMBED(str);
00869 RSTRING(str)->as.heap.len = len;
00870 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871 FL_SET(str, ELTS_SHARED);
00872 FL_UNSET(str, STR_ASSOC);
00873 RSTRING(str)->as.heap.aux.shared = shared;
00874 }
00875 else {
00876 str_replace_shared(str, str2);
00877 }
00878
00879 OBJ_INFECT(str, str2);
00880 rb_enc_cr_str_exact_copy(str, str2);
00881 return str;
00882 }
00883
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887 VALUE dup = str_alloc(klass);
00888 str_replace(dup, str);
00889 return dup;
00890 }
00891
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895 return str_duplicate(rb_obj_class(str), str);
00896 }
00897
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901 return str_replace(str_alloc(rb_cString), str);
00902 }
00903
00904
00905
00906
00907
00908
00909
00910
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914 VALUE orig;
00915
00916 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917 rb_str_replace(str, orig);
00918 return str;
00919 }
00920
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924 long c;
00925 const char *q;
00926
00927 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929 }
00930 else if (rb_enc_asciicompat(enc)) {
00931 c = 0;
00932 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933 while (p < e) {
00934 if (ISASCII(*p)) {
00935 q = search_nonascii(p, e);
00936 if (!q)
00937 return c + (e - p);
00938 c += q - p;
00939 p = q;
00940 }
00941 p += rb_enc_fast_mbclen(p, e, enc);
00942 c++;
00943 }
00944 }
00945 else {
00946 while (p < e) {
00947 if (ISASCII(*p)) {
00948 q = search_nonascii(p, e);
00949 if (!q)
00950 return c + (e - p);
00951 c += q - p;
00952 p = q;
00953 }
00954 p += rb_enc_mbclen(p, e, enc);
00955 c++;
00956 }
00957 }
00958 return c;
00959 }
00960
00961 for (c=0; p<e; c++) {
00962 p += rb_enc_mbclen(p, e, enc);
00963 }
00964 return c;
00965 }
00966
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976 long c;
00977 const char *q;
00978 int ret;
00979
00980 *cr = 0;
00981 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983 }
00984 else if (rb_enc_asciicompat(enc)) {
00985 c = 0;
00986 while (p < e) {
00987 if (ISASCII(*p)) {
00988 q = search_nonascii(p, e);
00989 if (!q) {
00990 if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991 return c + (e - p);
00992 }
00993 c += q - p;
00994 p = q;
00995 }
00996 ret = rb_enc_precise_mbclen(p, e, enc);
00997 if (MBCLEN_CHARFOUND_P(ret)) {
00998 *cr |= ENC_CODERANGE_VALID;
00999 p += MBCLEN_CHARFOUND_LEN(ret);
01000 }
01001 else {
01002 *cr = ENC_CODERANGE_BROKEN;
01003 p++;
01004 }
01005 c++;
01006 }
01007 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008 return c;
01009 }
01010
01011 for (c=0; p<e; c++) {
01012 ret = rb_enc_precise_mbclen(p, e, enc);
01013 if (MBCLEN_CHARFOUND_P(ret)) {
01014 *cr |= ENC_CODERANGE_VALID;
01015 p += MBCLEN_CHARFOUND_LEN(ret);
01016 }
01017 else {
01018 *cr = ENC_CODERANGE_BROKEN;
01019 if (p + rb_enc_mbminlen(enc) <= e)
01020 p += rb_enc_mbminlen(enc);
01021 else
01022 p = e;
01023 }
01024 }
01025 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026 return c;
01027 }
01028
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Count, in the VALUE-sized word at `s`, the bytes that are not of the form
 * 10xxxxxx.  Each byte is first reduced to a single 0/1 flag in its lowest
 * bit; the flags are then summed by folding the word onto its low byte.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE flags = *s;

    /* bit 6 of each byte becomes (bit7 == 0 || bit6 == 1), then moves to bit 0 */
    flags = (flags | ~(flags >> 1)) >> 6;
    flags &= NONASCII_MASK >> 7;

    /* horizontal add of the per-byte flags */
    flags += (flags >> 8);
    flags += (flags >> 16);
#if SIZEOF_VALUE == 8
    flags += (flags >> 32);
#endif
    return (flags & 0xF);
}
#endif
01046
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050 const char *p, *e;
01051 long n;
01052 int cr;
01053
01054 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055 if (!enc) enc = STR_ENC_GET(str);
01056 p = RSTRING_PTR(str);
01057 e = RSTRING_END(str);
01058 cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061 enc == rb_utf8_encoding()) {
01062
01063 VALUE len = 0;
01064 if ((int)sizeof(VALUE) * 2 < e - p) {
01065 const VALUE *s, *t;
01066 const VALUE lowbits = sizeof(VALUE) - 1;
01067 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068 t = (const VALUE*)(~lowbits & (VALUE)e);
01069 while (p < (const char *)s) {
01070 if (is_utf8_lead_byte(*p)) len++;
01071 p++;
01072 }
01073 while (s < t) {
01074 len += count_utf8_lead_bytes_with_word(s);
01075 s++;
01076 }
01077 p = (const char *)s;
01078 }
01079 while (p < e) {
01080 if (is_utf8_lead_byte(*p)) len++;
01081 p++;
01082 }
01083 return (long)len;
01084 }
01085 #endif
01086 n = rb_enc_strlen_cr(p, e, enc, &cr);
01087 if (cr) {
01088 ENC_CODERANGE_SET(str, cr);
01089 }
01090 return n;
01091 }
01092
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096 return str_strlen(str, STR_ENC_GET(str));
01097 }
01098
01099
01100
01101
01102
01103
01104
01105
01106
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110 long len;
01111
01112 len = str_strlen(str, STR_ENC_GET(str));
01113 return LONG2NUM(len);
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126 return INT2NUM(RSTRING_LEN(str));
01127 }
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137
01138
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142 if (RSTRING_LEN(str) == 0)
01143 return Qtrue;
01144 return Qfalse;
01145 }
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160 VALUE str3;
01161 rb_encoding *enc;
01162
01163 StringValue(str2);
01164 enc = rb_enc_check(str1, str2);
01165 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168 RSTRING_PTR(str2), RSTRING_LEN(str2));
01169 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170
01171 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172 OBJ_TAINT(str3);
01173 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175 return str3;
01176 }
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191 VALUE str2;
01192 long n, len;
01193 char *ptr2;
01194
01195 len = NUM2LONG(times);
01196 if (len < 0) {
01197 rb_raise(rb_eArgError, "negative argument");
01198 }
01199 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01200 rb_raise(rb_eArgError, "argument too big");
01201 }
01202
01203 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204 ptr2 = RSTRING_PTR(str2);
01205 if (len) {
01206 n = RSTRING_LEN(str);
01207 memcpy(ptr2, RSTRING_PTR(str), n);
01208 while (n <= len/2) {
01209 memcpy(ptr2 + n, ptr2, n);
01210 n *= 2;
01211 }
01212 memcpy(ptr2 + n, ptr2, len-n);
01213 }
01214 ptr2[RSTRING_LEN(str2)] = '\0';
01215 OBJ_INFECT(str2, str);
01216 rb_enc_cr_str_copy_for_substr(str2, str);
01217
01218 return str2;
01219 }
01220
01221
01222
01223
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235
01236 static VALUE
01237 rb_str_format_m(VALUE str, VALUE arg)
01238 {
01239 volatile VALUE tmp = rb_check_array_type(arg);
01240
01241 if (!NIL_P(tmp)) {
01242 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01243 }
01244 return rb_str_format(1, &arg, str);
01245 }
01246
01247 static inline void
01248 str_modifiable(VALUE str)
01249 {
01250 if (FL_TEST(str, STR_TMPLOCK)) {
01251 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01252 }
01253 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01254 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01255 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01256 }
01257
01258 static inline int
01259 str_independent(VALUE str)
01260 {
01261 str_modifiable(str);
01262 if (!STR_SHARED_P(str)) return 1;
01263 if (STR_EMBED_P(str)) return 1;
01264 return 0;
01265 }
01266
01267 static void
01268 str_make_independent(VALUE str)
01269 {
01270 char *ptr;
01271 long len = RSTRING_LEN(str);
01272
01273 ptr = ALLOC_N(char, len+1);
01274 if (RSTRING_PTR(str)) {
01275 memcpy(ptr, RSTRING_PTR(str), len);
01276 }
01277 STR_SET_NOEMBED(str);
01278 ptr[len] = 0;
01279 RSTRING(str)->as.heap.ptr = ptr;
01280 RSTRING(str)->as.heap.len = len;
01281 RSTRING(str)->as.heap.aux.capa = len;
01282 STR_UNSET_NOCAPA(str);
01283 }
01284
01285 void
01286 rb_str_modify(VALUE str)
01287 {
01288 if (!str_independent(str))
01289 str_make_independent(str);
01290 ENC_CODERANGE_CLEAR(str);
01291 }
01292
01293
01294 static void
01295 str_modify_keep_cr(VALUE str)
01296 {
01297 if (!str_independent(str))
01298 str_make_independent(str);
01299 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01300
01301 ENC_CODERANGE_CLEAR(str);
01302 }
01303
01304 static inline void
01305 str_discard(VALUE str)
01306 {
01307 str_modifiable(str);
01308 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01309 xfree(RSTRING_PTR(str));
01310 RSTRING(str)->as.heap.ptr = 0;
01311 RSTRING(str)->as.heap.len = 0;
01312 }
01313 }
01314
01315 void
01316 rb_str_associate(VALUE str, VALUE add)
01317 {
01318
01319 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01320 if (STR_ASSOC_P(str)) {
01321
01322 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01323 }
01324 else {
01325 if (STR_SHARED_P(str)) {
01326 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01327 str_make_independent(str);
01328 if (STR_ASSOC_P(assoc)) {
01329 assoc = RSTRING(assoc)->as.heap.aux.shared;
01330 rb_ary_concat(assoc, add);
01331 add = assoc;
01332 }
01333 }
01334 else if (STR_EMBED_P(str)) {
01335 str_make_independent(str);
01336 }
01337 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01338 RESIZE_CAPA(str, RSTRING_LEN(str));
01339 }
01340 FL_SET(str, STR_ASSOC);
01341 RBASIC(add)->klass = 0;
01342 RSTRING(str)->as.heap.aux.shared = add;
01343 }
01344 }
01345
01346 VALUE
01347 rb_str_associated(VALUE str)
01348 {
01349 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01350 if (STR_ASSOC_P(str)) {
01351 return RSTRING(str)->as.heap.aux.shared;
01352 }
01353 return Qfalse;
01354 }
01355
01356 VALUE
01357 rb_string_value(volatile VALUE *ptr)
01358 {
01359 VALUE s = *ptr;
01360 if (TYPE(s) != T_STRING) {
01361 s = rb_str_to_str(s);
01362 *ptr = s;
01363 }
01364 return s;
01365 }
01366
01367 char *
01368 rb_string_value_ptr(volatile VALUE *ptr)
01369 {
01370 VALUE str = rb_string_value(ptr);
01371 return RSTRING_PTR(str);
01372 }
01373
01374 char *
01375 rb_string_value_cstr(volatile VALUE *ptr)
01376 {
01377 VALUE str = rb_string_value(ptr);
01378 char *s = RSTRING_PTR(str);
01379 long len = RSTRING_LEN(str);
01380
01381 if (!s || memchr(s, 0, len)) {
01382 rb_raise(rb_eArgError, "string contains null byte");
01383 }
01384 if (s[len]) {
01385 rb_str_modify(str);
01386 s = RSTRING_PTR(str);
01387 s[RSTRING_LEN(str)] = 0;
01388 }
01389 return s;
01390 }
01391
01392 VALUE
01393 rb_check_string_type(VALUE str)
01394 {
01395 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01396 return str;
01397 }
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410 static VALUE
01411 rb_str_s_try_convert(VALUE dummy, VALUE str)
01412 {
01413 return rb_check_string_type(str);
01414 }
01415
01416 char*
01417 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01418 {
01419 if (rb_enc_mbmaxlen(enc) == 1) {
01420 p += nth;
01421 }
01422 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01423 p += nth * rb_enc_mbmaxlen(enc);
01424 }
01425 else if (rb_enc_asciicompat(enc)) {
01426 const char *p2, *e2;
01427 int n;
01428
01429 while (p < e && 0 < nth) {
01430 e2 = p + nth;
01431 if (e < e2)
01432 return (char *)e;
01433 if (ISASCII(*p)) {
01434 p2 = search_nonascii(p, e2);
01435 if (!p2)
01436 return (char *)e2;
01437 nth -= p2 - p;
01438 p = p2;
01439 }
01440 n = rb_enc_mbclen(p, e, enc);
01441 p += n;
01442 nth--;
01443 }
01444 if (nth != 0)
01445 return (char *)e;
01446 return (char *)p;
01447 }
01448 else {
01449 while (p<e && nth--) {
01450 p += rb_enc_mbclen(p, e, enc);
01451 }
01452 }
01453 if (p > e) p = e;
01454 return (char*)p;
01455 }
01456
01457 static char*
01458 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01459 {
01460 if (singlebyte)
01461 p += nth;
01462 else {
01463 p = rb_enc_nth(p, e, nth, enc);
01464 }
01465 if (!p) return 0;
01466 if (p > e) p = e;
01467 return (char *)p;
01468 }
01469
01470
01471 static long
01472 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01473 {
01474 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01475 if (!pp) return e - p;
01476 return pp - p;
01477 }
01478
01479 long
01480 rb_str_offset(VALUE str, long pos)
01481 {
01482 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01483 STR_ENC_GET(str), single_byte_optimizable(str));
01484 }
01485
01486 #ifdef NONASCII_MASK
01487 static char *
01488 str_utf8_nth(const char *p, const char *e, long nth)
01489 {
01490 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01491 const VALUE *s, *t;
01492 const VALUE lowbits = sizeof(VALUE) - 1;
01493 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01494 t = (const VALUE*)(~lowbits & (VALUE)e);
01495 while (p < (const char *)s) {
01496 if (is_utf8_lead_byte(*p)) nth--;
01497 p++;
01498 }
01499 do {
01500 nth -= count_utf8_lead_bytes_with_word(s);
01501 s++;
01502 } while (s < t && (int)sizeof(VALUE) <= nth);
01503 p = (char *)s;
01504 }
01505 while (p < e) {
01506 if (is_utf8_lead_byte(*p)) {
01507 if (nth == 0) break;
01508 nth--;
01509 }
01510 p++;
01511 }
01512 return (char *)p;
01513 }
01514
01515 static long
01516 str_utf8_offset(const char *p, const char *e, long nth)
01517 {
01518 const char *pp = str_utf8_nth(p, e, nth);
01519 return pp - p;
01520 }
01521 #endif
01522
01523
01524 long
01525 rb_str_sublen(VALUE str, long pos)
01526 {
01527 if (single_byte_optimizable(str) || pos < 0)
01528 return pos;
01529 else {
01530 char *p = RSTRING_PTR(str);
01531 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01532 }
01533 }
01534
01535 VALUE
01536 rb_str_subseq(VALUE str, long beg, long len)
01537 {
01538 VALUE str2;
01539
01540 if (RSTRING_LEN(str) == beg + len &&
01541 RSTRING_EMBED_LEN_MAX < len) {
01542 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01543 rb_str_drop_bytes(str2, beg);
01544 }
01545 else {
01546 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01547 }
01548
01549 rb_enc_cr_str_copy_for_substr(str2, str);
01550 OBJ_INFECT(str2, str);
01551
01552 return str2;
01553 }
01554
01555 VALUE
01556 rb_str_substr(VALUE str, long beg, long len)
01557 {
01558 rb_encoding *enc = STR_ENC_GET(str);
01559 VALUE str2;
01560 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01561
01562 if (len < 0) return Qnil;
01563 if (!RSTRING_LEN(str)) {
01564 len = 0;
01565 }
01566 if (single_byte_optimizable(str)) {
01567 if (beg > RSTRING_LEN(str)) return Qnil;
01568 if (beg < 0) {
01569 beg += RSTRING_LEN(str);
01570 if (beg < 0) return Qnil;
01571 }
01572 if (beg + len > RSTRING_LEN(str))
01573 len = RSTRING_LEN(str) - beg;
01574 if (len <= 0) {
01575 len = 0;
01576 p = 0;
01577 }
01578 else
01579 p = s + beg;
01580 goto sub;
01581 }
01582 if (beg < 0) {
01583 if (len > -beg) len = -beg;
01584 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01585 beg = -beg;
01586 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01587 p = e;
01588 if (!p) return Qnil;
01589 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01590 if (!p) return Qnil;
01591 len = e - p;
01592 goto sub;
01593 }
01594 else {
01595 beg += str_strlen(str, enc);
01596 if (beg < 0) return Qnil;
01597 }
01598 }
01599 else if (beg > 0 && beg > str_strlen(str, enc)) {
01600 return Qnil;
01601 }
01602 if (len == 0) {
01603 p = 0;
01604 }
01605 #ifdef NONASCII_MASK
01606 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01607 enc == rb_utf8_encoding()) {
01608 p = str_utf8_nth(s, e, beg);
01609 len = str_utf8_offset(p, e, len);
01610 }
01611 #endif
01612 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01613 int char_sz = rb_enc_mbmaxlen(enc);
01614
01615 p = s + beg * char_sz;
01616 if (p > e) {
01617 p = e;
01618 len = 0;
01619 }
01620 else if (len * char_sz > e - p)
01621 len = e - p;
01622 else
01623 len *= char_sz;
01624 }
01625 else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
01626 len = 0;
01627 }
01628 else {
01629 len = str_offset(p, e, len, enc, 0);
01630 }
01631 sub:
01632 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01633 str2 = rb_str_new4(str);
01634 str2 = str_new3(rb_obj_class(str2), str2);
01635 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01636 RSTRING(str2)->as.heap.len = len;
01637 }
01638 else {
01639 str2 = rb_str_new5(str, p, len);
01640 rb_enc_cr_str_copy_for_substr(str2, str);
01641 OBJ_INFECT(str2, str);
01642 }
01643
01644 return str2;
01645 }
01646
01647 VALUE
01648 rb_str_freeze(VALUE str)
01649 {
01650 if (STR_ASSOC_P(str)) {
01651 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01652 OBJ_FREEZE(ary);
01653 }
01654 return rb_obj_freeze(str);
01655 }
01656
/*
 * rb_str_dup_frozen is provided as an alias of rb_str_new_frozen: the
 * RUBY_ALIAS_FUNCTION invocation names the alias, its target and the
 * forwarded argument list, and the #define that follows maps later uses
 * of rb_str_dup_frozen in this file onto rb_str_new_frozen.
 * NOTE(review): the expansion of RUBY_ALIAS_FUNCTION is defined outside
 * this file; presumably it emits a real rb_str_dup_frozen symbol --
 * confirm.
 */
01657 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01658 #define rb_str_dup_frozen rb_str_new_frozen
01659
01660 VALUE
01661 rb_str_locktmp(VALUE str)
01662 {
01663 if (FL_TEST(str, STR_TMPLOCK)) {
01664 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01665 }
01666 FL_SET(str, STR_TMPLOCK);
01667 return str;
01668 }
01669
01670 VALUE
01671 rb_str_unlocktmp(VALUE str)
01672 {
01673 if (!FL_TEST(str, STR_TMPLOCK)) {
01674 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01675 }
01676 FL_UNSET(str, STR_TMPLOCK);
01677 return str;
01678 }
01679
01680 void
01681 rb_str_set_len(VALUE str, long len)
01682 {
01683 rb_str_modify(str);
01684 STR_SET_LEN(str, len);
01685 RSTRING_PTR(str)[len] = '\0';
01686 }
01687
01688 VALUE
01689 rb_str_resize(VALUE str, long len)
01690 {
01691 long slen;
01692
01693 if (len < 0) {
01694 rb_raise(rb_eArgError, "negative string size (or size too big)");
01695 }
01696
01697 rb_str_modify(str);
01698 slen = RSTRING_LEN(str);
01699 if (len != slen) {
01700 if (STR_EMBED_P(str)) {
01701 char *ptr;
01702 if (len <= RSTRING_EMBED_LEN_MAX) {
01703 STR_SET_EMBED_LEN(str, len);
01704 RSTRING(str)->as.ary[len] = '\0';
01705 return str;
01706 }
01707 ptr = ALLOC_N(char,len+1);
01708 MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
01709 RSTRING(str)->as.heap.ptr = ptr;
01710 STR_SET_NOEMBED(str);
01711 }
01712 else if (len <= RSTRING_EMBED_LEN_MAX) {
01713 char *ptr = RSTRING(str)->as.heap.ptr;
01714 STR_SET_EMBED(str);
01715 if (slen > len) slen = len;
01716 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01717 RSTRING(str)->as.ary[len] = '\0';
01718 STR_SET_EMBED_LEN(str, len);
01719 xfree(ptr);
01720 return str;
01721 }
01722 else if (slen < len || slen - len > 1024) {
01723 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01724 }
01725 if (!STR_NOCAPA_P(str)) {
01726 RSTRING(str)->as.heap.aux.capa = len;
01727 }
01728 RSTRING(str)->as.heap.len = len;
01729 RSTRING(str)->as.heap.ptr[len] = '\0';
01730 }
01731 return str;
01732 }
01733
01734 static VALUE
01735 str_buf_cat(VALUE str, const char *ptr, long len)
01736 {
01737 long capa, total, off = -1;
01738
01739 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01740 off = ptr - RSTRING_PTR(str);
01741 }
01742 rb_str_modify(str);
01743 if (len == 0) return 0;
01744 if (STR_ASSOC_P(str)) {
01745 FL_UNSET(str, STR_ASSOC);
01746 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01747 }
01748 else if (STR_EMBED_P(str)) {
01749 capa = RSTRING_EMBED_LEN_MAX;
01750 }
01751 else {
01752 capa = RSTRING(str)->as.heap.aux.capa;
01753 }
01754 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01755 rb_raise(rb_eArgError, "string sizes too big");
01756 }
01757 total = RSTRING_LEN(str)+len;
01758 if (capa <= total) {
01759 while (total > capa) {
01760 if (capa + 1 >= LONG_MAX / 2) {
01761 capa = (total + 4095) / 4096;
01762 break;
01763 }
01764 capa = (capa + 1) * 2;
01765 }
01766 RESIZE_CAPA(str, capa);
01767 }
01768 if (off != -1) {
01769 ptr = RSTRING_PTR(str) + off;
01770 }
01771 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01772 STR_SET_LEN(str, total);
01773 RSTRING_PTR(str)[total] = '\0';
01774
01775 return str;
01776 }
01777
/* Appends the NUL-terminated C string +ptr+ to +str+ via str_buf_cat().
 * Note: +ptr+ is evaluated twice. */
#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01779
01780 VALUE
01781 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01782 {
01783 if (len == 0) return str;
01784 if (len < 0) {
01785 rb_raise(rb_eArgError, "negative string size (or size too big)");
01786 }
01787 return str_buf_cat(str, ptr, len);
01788 }
01789
01790 VALUE
01791 rb_str_buf_cat2(VALUE str, const char *ptr)
01792 {
01793 return rb_str_buf_cat(str, ptr, strlen(ptr));
01794 }
01795
01796 VALUE
01797 rb_str_cat(VALUE str, const char *ptr, long len)
01798 {
01799 if (len < 0) {
01800 rb_raise(rb_eArgError, "negative string size (or size too big)");
01801 }
01802 if (STR_ASSOC_P(str)) {
01803 rb_str_modify(str);
01804 if (STR_EMBED_P(str)) str_make_independent(str);
01805 REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01806 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01807 RSTRING(str)->as.heap.len += len;
01808 RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0';
01809 return str;
01810 }
01811
01812 return rb_str_buf_cat(str, ptr, len);
01813 }
01814
01815 VALUE
01816 rb_str_cat2(VALUE str, const char *ptr)
01817 {
01818 return rb_str_cat(str, ptr, strlen(ptr));
01819 }
01820
01821 static VALUE
01822 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01823 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01824 {
01825 int str_encindex = ENCODING_GET(str);
01826 int res_encindex;
01827 int str_cr, res_cr;
01828 int str_a8 = ENCODING_IS_ASCII8BIT(str);
01829 int ptr_a8 = ptr_encindex == 0;
01830
01831 str_cr = ENC_CODERANGE(str);
01832
01833 if (str_encindex == ptr_encindex) {
01834 if (str_cr == ENC_CODERANGE_UNKNOWN ||
01835 (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
01836 ptr_cr = ENC_CODERANGE_UNKNOWN;
01837 }
01838 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01839 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01840 }
01841 }
01842 else {
01843 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01844 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01845 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01846 if (len == 0)
01847 return str;
01848 if (RSTRING_LEN(str) == 0) {
01849 rb_str_buf_cat(str, ptr, len);
01850 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01851 return str;
01852 }
01853 goto incompatible;
01854 }
01855 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01856 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01857 }
01858 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01859 if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
01860 str_cr = rb_enc_str_coderange(str);
01861 }
01862 }
01863 }
01864 if (ptr_cr_ret)
01865 *ptr_cr_ret = ptr_cr;
01866
01867 if (str_encindex != ptr_encindex &&
01868 str_cr != ENC_CODERANGE_7BIT &&
01869 ptr_cr != ENC_CODERANGE_7BIT) {
01870 incompatible:
01871 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01872 rb_enc_name(rb_enc_from_index(str_encindex)),
01873 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01874 }
01875
01876 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01877 res_encindex = str_encindex;
01878 res_cr = ENC_CODERANGE_UNKNOWN;
01879 }
01880 else if (str_cr == ENC_CODERANGE_7BIT) {
01881 if (ptr_cr == ENC_CODERANGE_7BIT) {
01882 res_encindex = !str_a8 ? str_encindex : ptr_encindex;
01883 res_cr = ENC_CODERANGE_7BIT;
01884 }
01885 else {
01886 res_encindex = ptr_encindex;
01887 res_cr = ptr_cr;
01888 }
01889 }
01890 else if (str_cr == ENC_CODERANGE_VALID) {
01891 res_encindex = str_encindex;
01892 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01893 res_cr = str_cr;
01894 else
01895 res_cr = ptr_cr;
01896 }
01897 else {
01898 res_encindex = str_encindex;
01899 res_cr = str_cr;
01900 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01901 }
01902
01903 if (len < 0) {
01904 rb_raise(rb_eArgError, "negative string size (or size too big)");
01905 }
01906 str_buf_cat(str, ptr, len);
01907 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01908 return str;
01909 }
01910
01911 VALUE
01912 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01913 {
01914 return rb_enc_cr_str_buf_cat(str, ptr, len,
01915 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01916 }
01917
01918 VALUE
01919 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01920 {
01921
01922 int encindex = ENCODING_GET(str);
01923 rb_encoding *enc = rb_enc_from_index(encindex);
01924 if (rb_enc_asciicompat(enc)) {
01925 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01926 encindex, ENC_CODERANGE_7BIT, 0);
01927 }
01928 else {
01929 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01930 while (*ptr) {
01931 unsigned int c = (unsigned char)*ptr;
01932 int len = rb_enc_codelen(c, enc);
01933 rb_enc_mbcput(c, buf, enc);
01934 rb_enc_cr_str_buf_cat(str, buf, len,
01935 encindex, ENC_CODERANGE_VALID, 0);
01936 ptr++;
01937 }
01938 return str;
01939 }
01940 }
01941
01942 VALUE
01943 rb_str_buf_append(VALUE str, VALUE str2)
01944 {
01945 int str2_cr;
01946
01947 str2_cr = ENC_CODERANGE(str2);
01948
01949 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01950 ENCODING_GET(str2), str2_cr, &str2_cr);
01951
01952 OBJ_INFECT(str, str2);
01953 ENC_CODERANGE_SET(str2, str2_cr);
01954
01955 return str;
01956 }
01957
01958 VALUE
01959 rb_str_append(VALUE str, VALUE str2)
01960 {
01961 rb_encoding *enc;
01962 int cr, cr2;
01963
01964 StringValue(str2);
01965 if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01966 long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01967 enc = rb_enc_check(str, str2);
01968 cr = ENC_CODERANGE(str);
01969 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01970 rb_str_modify(str);
01971 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01972 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01973 RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01974 RSTRING(str)->as.heap.len = len;
01975 rb_enc_associate(str, enc);
01976 ENC_CODERANGE_SET(str, cr);
01977 OBJ_INFECT(str, str2);
01978 return str;
01979 }
01980 return rb_str_buf_append(str, str2);
01981 }
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991
01992
01993
01994
01995
01996
01997
01998
01999
02000 VALUE
02001 rb_str_concat(VALUE str1, VALUE str2)
02002 {
02003 unsigned int lc;
02004
02005 if (FIXNUM_P(str2)) {
02006 if ((int)str2 < 0)
02007 rb_raise(rb_eRangeError, "negative argument");
02008 lc = FIX2UINT(str2);
02009 }
02010 else if (TYPE(str2) == T_BIGNUM) {
02011 if (!RBIGNUM_SIGN(str2))
02012 rb_raise(rb_eRangeError, "negative argument");
02013 lc = NUM2UINT(str2);
02014 }
02015 else {
02016 return rb_str_append(str1, str2);
02017 }
02018 #if SIZEOF_INT < SIZEOF_VALUE
02019 if ((VALUE)lc > UINT_MAX) {
02020 rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02021 }
02022 #endif
02023 {
02024 rb_encoding *enc = STR_ENC_GET(str1);
02025 long pos = RSTRING_LEN(str1);
02026 int cr = ENC_CODERANGE(str1);
02027 int len;
02028
02029 if ((len = rb_enc_codelen(lc, enc)) <= 0) {
02030 rb_raise(rb_eRangeError, "%u invalid char", lc);
02031 }
02032 rb_str_resize(str1, pos+len);
02033 rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc);
02034 if (cr == ENC_CODERANGE_7BIT && lc > 127)
02035 cr = ENC_CODERANGE_VALID;
02036 ENC_CODERANGE_SET(str1, cr);
02037 return str1;
02038 }
02039 }
02040
02041 st_index_t
02042 rb_memhash(const void *ptr, long len)
02043 {
02044 return st_hash(ptr, len, rb_hash_start(0));
02045 }
02046
02047 st_index_t
02048 rb_str_hash(VALUE str)
02049 {
02050 int e = ENCODING_GET(str);
02051 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02052 e = 0;
02053 }
02054 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02055 }
02056
02057 int
02058 rb_str_hash_cmp(VALUE str1, VALUE str2)
02059 {
02060 long len;
02061
02062 if (!rb_str_comparable(str1, str2)) return 1;
02063 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02064 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02065 return 0;
02066 }
02067 return 1;
02068 }
02069
02070
02071
02072
02073
02074
02075
02076
02077 static VALUE
02078 rb_str_hash_m(VALUE str)
02079 {
02080 st_index_t hval = rb_str_hash(str);
02081 return INT2FIX(hval);
02082 }
02083
/* Smaller of two values; returns (a) when they compare equal.
 * Each argument may be evaluated twice. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
02085
02086 int
02087 rb_str_comparable(VALUE str1, VALUE str2)
02088 {
02089 int idx1, idx2;
02090 int rc1, rc2;
02091
02092 if (RSTRING_LEN(str1) == 0) return TRUE;
02093 if (RSTRING_LEN(str2) == 0) return TRUE;
02094 idx1 = ENCODING_GET(str1);
02095 idx2 = ENCODING_GET(str2);
02096 if (idx1 == idx2) return TRUE;
02097 rc1 = rb_enc_str_coderange(str1);
02098 rc2 = rb_enc_str_coderange(str2);
02099 if (rc1 == ENC_CODERANGE_7BIT) {
02100 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02101 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02102 return TRUE;
02103 }
02104 if (rc2 == ENC_CODERANGE_7BIT) {
02105 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02106 return TRUE;
02107 }
02108 return FALSE;
02109 }
02110
02111 int
02112 rb_str_cmp(VALUE str1, VALUE str2)
02113 {
02114 long len;
02115 int retval;
02116
02117 len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02118 retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02119 if (retval == 0) {
02120 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02121 if (!rb_str_comparable(str1, str2)) {
02122 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02123 return 1;
02124 return -1;
02125 }
02126 return 0;
02127 }
02128 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02129 return -1;
02130 }
02131 if (retval > 0) return 1;
02132 return -1;
02133 }
02134
02135
02136 static VALUE
02137 str_eql(const VALUE str1, const VALUE str2)
02138 {
02139 const long len = RSTRING_LEN(str1);
02140
02141 if (len != RSTRING_LEN(str2)) return Qfalse;
02142 if (!rb_str_comparable(str1, str2)) return Qfalse;
02143 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02144 return Qtrue;
02145 return Qfalse;
02146 }
02147
02148
02149
02150
02151
02152
02153
02154
02155
02156 VALUE
02157 rb_str_equal(VALUE str1, VALUE str2)
02158 {
02159 if (str1 == str2) return Qtrue;
02160 if (TYPE(str2) != T_STRING) {
02161 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02162 return Qfalse;
02163 }
02164 return rb_equal(str2, str1);
02165 }
02166 return str_eql(str1, str2);
02167 }
02168
02169
02170
02171
02172
02173
02174
02175
02176 static VALUE
02177 rb_str_eql(VALUE str1, VALUE str2)
02178 {
02179 if (TYPE(str2) != T_STRING) return Qfalse;
02180 return str_eql(str1, str2);
02181 }
02182
02183
02184
02185
02186
02187
02188
02189
02190
02191
02192
02193
02194
02195
02196
02197
02198
02199
02200
02201
02202
02203
02204
02205
02206 static VALUE
02207 rb_str_cmp_m(VALUE str1, VALUE str2)
02208 {
02209 long result;
02210
02211 if (TYPE(str2) != T_STRING) {
02212 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02213 return Qnil;
02214 }
02215 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02216 return Qnil;
02217 }
02218 else {
02219 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02220
02221 if (NIL_P(tmp)) return Qnil;
02222 if (!FIXNUM_P(tmp)) {
02223 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02224 }
02225 result = -FIX2LONG(tmp);
02226 }
02227 }
02228 else {
02229 result = rb_str_cmp(str1, str2);
02230 }
02231 return LONG2NUM(result);
02232 }
02233
02234
02235
02236
02237
02238
02239
02240
02241
02242
02243
02244
02245
02246 static VALUE
02247 rb_str_casecmp(VALUE str1, VALUE str2)
02248 {
02249 long len;
02250 rb_encoding *enc;
02251 char *p1, *p1end, *p2, *p2end;
02252
02253 StringValue(str2);
02254 enc = rb_enc_compatible(str1, str2);
02255 if (!enc) {
02256 return Qnil;
02257 }
02258
02259 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02260 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02261 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02262 while (p1 < p1end && p2 < p2end) {
02263 if (*p1 != *p2) {
02264 unsigned int c1 = TOUPPER(*p1 & 0xff);
02265 unsigned int c2 = TOUPPER(*p2 & 0xff);
02266 if (c1 != c2)
02267 return INT2FIX(c1 < c2 ? -1 : 1);
02268 }
02269 p1++;
02270 p2++;
02271 }
02272 }
02273 else {
02274 while (p1 < p1end && p2 < p2end) {
02275 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02276 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02277
02278 if (0 <= c1 && 0 <= c2) {
02279 c1 = TOUPPER(c1);
02280 c2 = TOUPPER(c2);
02281 if (c1 != c2)
02282 return INT2FIX(c1 < c2 ? -1 : 1);
02283 }
02284 else {
02285 int r;
02286 l1 = rb_enc_mbclen(p1, p1end, enc);
02287 l2 = rb_enc_mbclen(p2, p2end, enc);
02288 len = l1 < l2 ? l1 : l2;
02289 r = memcmp(p1, p2, len);
02290 if (r != 0)
02291 return INT2FIX(r < 0 ? -1 : 1);
02292 if (l1 != l2)
02293 return INT2FIX(l1 < l2 ? -1 : 1);
02294 }
02295 p1 += l1;
02296 p2 += l2;
02297 }
02298 }
02299 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02300 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02301 return INT2FIX(-1);
02302 }
02303
02304 static long
02305 rb_str_index(VALUE str, VALUE sub, long offset)
02306 {
02307 long pos;
02308 char *s, *sptr, *e;
02309 long len, slen;
02310 rb_encoding *enc;
02311
02312 enc = rb_enc_check(str, sub);
02313 if (is_broken_string(sub)) {
02314 return -1;
02315 }
02316 len = str_strlen(str, enc);
02317 slen = str_strlen(sub, enc);
02318 if (offset < 0) {
02319 offset += len;
02320 if (offset < 0) return -1;
02321 }
02322 if (len - offset < slen) return -1;
02323 s = RSTRING_PTR(str);
02324 e = s + RSTRING_LEN(str);
02325 if (offset) {
02326 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02327 s += offset;
02328 }
02329 if (slen == 0) return offset;
02330
02331 sptr = RSTRING_PTR(sub);
02332 slen = RSTRING_LEN(sub);
02333 len = RSTRING_LEN(str) - offset;
02334 for (;;) {
02335 char *t;
02336 pos = rb_memsearch(sptr, slen, s, len, enc);
02337 if (pos < 0) return pos;
02338 t = rb_enc_right_char_head(s, s+pos, e, enc);
02339 if (t == s + pos) break;
02340 if ((len -= t - s) <= 0) return -1;
02341 offset += t - s;
02342 s = t;
02343 }
02344 return pos + offset;
02345 }
02346
02347
02348
02349
02350
02351
02352
02353
02354
02355
02356
02357
02358
02359
02360
02361
02362
02363
02364
02365 static VALUE
02366 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02367 {
02368 VALUE sub;
02369 VALUE initpos;
02370 long pos;
02371
02372 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02373 pos = NUM2LONG(initpos);
02374 }
02375 else {
02376 pos = 0;
02377 }
02378 if (pos < 0) {
02379 pos += str_strlen(str, STR_ENC_GET(str));
02380 if (pos < 0) {
02381 if (TYPE(sub) == T_REGEXP) {
02382 rb_backref_set(Qnil);
02383 }
02384 return Qnil;
02385 }
02386 }
02387
02388 switch (TYPE(sub)) {
02389 case T_REGEXP:
02390 if (pos > str_strlen(str, STR_ENC_GET(str)))
02391 return Qnil;
02392 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02393 rb_enc_check(str, sub), single_byte_optimizable(str));
02394
02395 pos = rb_reg_search(sub, str, pos, 0);
02396 pos = rb_str_sublen(str, pos);
02397 break;
02398
02399 default: {
02400 VALUE tmp;
02401
02402 tmp = rb_check_string_type(sub);
02403 if (NIL_P(tmp)) {
02404 rb_raise(rb_eTypeError, "type mismatch: %s given",
02405 rb_obj_classname(sub));
02406 }
02407 sub = tmp;
02408 }
02409
02410 case T_STRING:
02411 pos = rb_str_index(str, sub, pos);
02412 pos = rb_str_sublen(str, pos);
02413 break;
02414 }
02415
02416 if (pos == -1) return Qnil;
02417 return LONG2NUM(pos);
02418 }
02419
02420 static long
02421 rb_str_rindex(VALUE str, VALUE sub, long pos)
02422 {
02423 long len, slen;
02424 char *s, *sbeg, *e, *t;
02425 rb_encoding *enc;
02426 int singlebyte = single_byte_optimizable(str);
02427
02428 enc = rb_enc_check(str, sub);
02429 if (is_broken_string(sub)) {
02430 return -1;
02431 }
02432 len = str_strlen(str, enc);
02433 slen = str_strlen(sub, enc);
02434
02435 if (len < slen) return -1;
02436 if (len - pos < slen) {
02437 pos = len - slen;
02438 }
02439 if (len == 0) {
02440 return pos;
02441 }
02442 sbeg = RSTRING_PTR(str);
02443 e = RSTRING_END(str);
02444 t = RSTRING_PTR(sub);
02445 slen = RSTRING_LEN(sub);
02446 for (;;) {
02447 s = str_nth(sbeg, e, pos, enc, singlebyte);
02448 if (!s) return -1;
02449 if (memcmp(s, t, slen) == 0) {
02450 return pos;
02451 }
02452 if (pos == 0) break;
02453 pos--;
02454 }
02455 return -1;
02456 }
02457
02458
02459
02460
02461
02462
02463
02464
02465
02466
02467
02468
02469
02470
02471
02472
02473
02474
02475
02476
02477 static VALUE
02478 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02479 {
02480 VALUE sub;
02481 VALUE vpos;
02482 rb_encoding *enc = STR_ENC_GET(str);
02483 long pos, len = str_strlen(str, enc);
02484
02485 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02486 pos = NUM2LONG(vpos);
02487 if (pos < 0) {
02488 pos += len;
02489 if (pos < 0) {
02490 if (TYPE(sub) == T_REGEXP) {
02491 rb_backref_set(Qnil);
02492 }
02493 return Qnil;
02494 }
02495 }
02496 if (pos > len) pos = len;
02497 }
02498 else {
02499 pos = len;
02500 }
02501
02502 switch (TYPE(sub)) {
02503 case T_REGEXP:
02504
02505 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02506 STR_ENC_GET(str), single_byte_optimizable(str));
02507
02508 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02509 pos = rb_reg_search(sub, str, pos, 1);
02510 pos = rb_str_sublen(str, pos);
02511 }
02512 if (pos >= 0) return LONG2NUM(pos);
02513 break;
02514
02515 default: {
02516 VALUE tmp;
02517
02518 tmp = rb_check_string_type(sub);
02519 if (NIL_P(tmp)) {
02520 rb_raise(rb_eTypeError, "type mismatch: %s given",
02521 rb_obj_classname(sub));
02522 }
02523 sub = tmp;
02524 }
02525
02526 case T_STRING:
02527 pos = rb_str_rindex(str, sub, pos);
02528 if (pos >= 0) return LONG2NUM(pos);
02529 break;
02530 }
02531 return Qnil;
02532 }
02533
02534
02535
02536
02537
02538
02539
02540
02541
02542
02543
02544
02545
02546
02547
02548 static VALUE
02549 rb_str_match(VALUE x, VALUE y)
02550 {
02551 switch (TYPE(y)) {
02552 case T_STRING:
02553 rb_raise(rb_eTypeError, "type mismatch: String given");
02554
02555 case T_REGEXP:
02556 return rb_reg_match(y, x);
02557
02558 default:
02559 return rb_funcall(y, rb_intern("=~"), 1, x);
02560 }
02561 }
02562
02563
/* Forward declaration: get_pat() is called by rb_str_match_m() below.
 * NOTE(review): presumably defined later in this file -- confirm. */
02564 static VALUE get_pat(VALUE, int);
02565
02566
02567
02568
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594
02595
02596
02597
02598 static VALUE
02599 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02600 {
02601 VALUE re, result;
02602 if (argc < 1)
02603 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02604 re = argv[0];
02605 argv[0] = str;
02606 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02607 if (!NIL_P(result) && rb_block_given_p()) {
02608 return rb_yield(result);
02609 }
02610 return result;
02611 }
02612
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no neighbor: input is not a steppable character */
    NEIGHBOR_FOUND,     /* neighbor written in place */
    NEIGHBOR_WRAPPED    /* stepped past the end of the range and wrapped */
};
02618
02619 static enum neighbor_char
02620 enc_succ_char(char *p, long len, rb_encoding *enc)
02621 {
02622 long i;
02623 int l;
02624 while (1) {
02625 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02626 p[i] = '\0';
02627 if (i < 0)
02628 return NEIGHBOR_WRAPPED;
02629 ++((unsigned char*)p)[i];
02630 l = rb_enc_precise_mbclen(p, p+len, enc);
02631 if (MBCLEN_CHARFOUND_P(l)) {
02632 l = MBCLEN_CHARFOUND_LEN(l);
02633 if (l == len) {
02634 return NEIGHBOR_FOUND;
02635 }
02636 else {
02637 memset(p+l, 0xff, len-l);
02638 }
02639 }
02640 if (MBCLEN_INVALID_P(l) && i < len-1) {
02641 long len2;
02642 int l2;
02643 for (len2 = len-1; 0 < len2; len2--) {
02644 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02645 if (!MBCLEN_INVALID_P(l2))
02646 break;
02647 }
02648 memset(p+len2+1, 0xff, len-(len2+1));
02649 }
02650 }
02651 }
02652
02653 static enum neighbor_char
02654 enc_pred_char(char *p, long len, rb_encoding *enc)
02655 {
02656 long i;
02657 int l;
02658 while (1) {
02659 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02660 p[i] = '\xff';
02661 if (i < 0)
02662 return NEIGHBOR_WRAPPED;
02663 --((unsigned char*)p)[i];
02664 l = rb_enc_precise_mbclen(p, p+len, enc);
02665 if (MBCLEN_CHARFOUND_P(l)) {
02666 l = MBCLEN_CHARFOUND_LEN(l);
02667 if (l == len) {
02668 return NEIGHBOR_FOUND;
02669 }
02670 else {
02671 memset(p+l, 0, len-l);
02672 }
02673 }
02674 if (MBCLEN_INVALID_P(l) && i < len-1) {
02675 long len2;
02676 int l2;
02677 for (len2 = len-1; 0 < len2; len2--) {
02678 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02679 if (!MBCLEN_INVALID_P(l2))
02680 break;
02681 }
02682 memset(p+len2+1, 0, len-(len2+1));
02683 }
02684 }
02685 }
02686
02687
02688
02689
02690
02691
02692
02693
02694
02695
02696 static enum neighbor_char
02697 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02698 {
02699 enum neighbor_char ret;
02700 unsigned int c;
02701 int ctype;
02702 int range;
02703 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02704
02705 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02706 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02707 ctype = ONIGENC_CTYPE_DIGIT;
02708 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02709 ctype = ONIGENC_CTYPE_ALPHA;
02710 else
02711 return NEIGHBOR_NOT_CHAR;
02712
02713 MEMCPY(save, p, char, len);
02714 ret = enc_succ_char(p, len, enc);
02715 if (ret == NEIGHBOR_FOUND) {
02716 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02717 if (rb_enc_isctype(c, ctype, enc))
02718 return NEIGHBOR_FOUND;
02719 }
02720 MEMCPY(p, save, char, len);
02721 range = 1;
02722 while (1) {
02723 MEMCPY(save, p, char, len);
02724 ret = enc_pred_char(p, len, enc);
02725 if (ret == NEIGHBOR_FOUND) {
02726 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02727 if (!rb_enc_isctype(c, ctype, enc)) {
02728 MEMCPY(p, save, char, len);
02729 break;
02730 }
02731 }
02732 else {
02733 MEMCPY(p, save, char, len);
02734 break;
02735 }
02736 range++;
02737 }
02738 if (range == 1) {
02739 return NEIGHBOR_NOT_CHAR;
02740 }
02741
02742 if (ctype != ONIGENC_CTYPE_DIGIT) {
02743 MEMCPY(carry, p, char, len);
02744 return NEIGHBOR_WRAPPED;
02745 }
02746
02747 MEMCPY(carry, p, char, len);
02748 enc_succ_char(carry, len, enc);
02749 return NEIGHBOR_WRAPPED;
02750 }
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770
02771
02772
02773
02774
02775
02776
02777
/*
 * Returns a new string holding the successor of orig.  All byte edits are
 * made on the copy (str); orig is only read.  An empty orig yields an empty
 * copy.
 *
 * Pass 1 walks the characters from the end towards the front and steps
 * digit/alphabetic characters with enc_succ_alnum_char().  If no such
 * character took part (c stays -1), pass 2 steps raw characters with
 * enc_succ_char() instead.  When the walk ends without a NEIGHBOR_FOUND,
 * the pending carry bytes are inserted at carry_pos.
 */
02778 VALUE
02779 rb_str_succ(VALUE orig)
02780 {
02781 rb_encoding *enc;
02782 VALUE str;
02783 char *sbeg, *s, *e, *last_alnum = 0;
02784 int c = -1;
02785 long l;
/* default carry: a single 0x01 byte, inserted at offset 0 unless overridden */
02786 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02787 long carry_pos = 0, carry_len = 1;
02788 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02789
02790 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02791 rb_enc_cr_str_copy_for_substr(str, orig);
02792 OBJ_INFECT(str, orig);
02793 if (RSTRING_LEN(str) == 0) return str;
02794
02795 enc = STR_ENC_GET(orig);
02796 sbeg = RSTRING_PTR(str);
02797 s = e = sbeg + RSTRING_LEN(str);
02798
/* pass 1: step alphanumeric characters, rightmost first */
02799 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* The previous character was skipped as non-alphanumeric and an
 * alphanumeric further right has already wrapped: if that wrapped
 * character and the current one are of opposite kinds (letter vs. digit,
 * judged on their first byte), stop here and go back to the wrapped one. */
02800 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02801 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02802 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02803 s = last_alnum;
02804 break;
02805 }
02806 }
/* skip positions that do not start a complete character */
02807 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02808 neighbor = enc_succ_alnum_char(s, l, enc, carry);
02809 switch (neighbor) {
02810 case NEIGHBOR_NOT_CHAR:
02811 continue;
02812 case NEIGHBOR_FOUND:
02813 return str;
02814 case NEIGHBOR_WRAPPED:
02815 last_alnum = s;
02816 break;
02817 }
/* wrapped: remember where (and how long) the carry would be inserted */
02818 c = 1;
02819 carry_pos = s - sbeg;
02820 carry_len = l;
02821 }
/* pass 2: no alphanumeric character was stepped, so step raw characters */
02822 if (c == -1) {
02823 s = e;
02824 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02825 enum neighbor_char neighbor;
02826 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02827 neighbor = enc_succ_char(s, l, enc);
02828 if (neighbor == NEIGHBOR_FOUND)
02829 return str;
/* the wrapped bytes are not a complete l-byte character: step once more */
02830 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02831
02832 enc_succ_char(s, l, enc);
02833 }
/* for encodings that are not ASCII-compatible the carry is a copy of the
 * wrapped character rather than the default 0x01 byte */
02834 if (!rb_enc_asciicompat(enc)) {
02835 MEMCPY(carry, s, char, l);
02836 carry_len = l;
02837 }
02838 carry_pos = s - sbeg;
02839 }
02840 }
/* grow by carry_len, shift the tail right and splice the carry in */
02841 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02842 s = RSTRING_PTR(str) + carry_pos;
02843 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02844 memmove(s, carry, carry_len);
02845 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02846 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
/* NOTE(review): presumably recomputes and caches the code range of the
 * edited string -- confirm against rb_enc_str_coderange() */
02847 rb_enc_str_coderange(str);
02848 return str;
02849 }
02850
02851
02852
02853
02854
02855
02856
02857
02858
02859
02860
02861 static VALUE
02862 rb_str_succ_bang(VALUE str)
02863 {
02864 rb_str_shared_replace(str, rb_str_succ(str));
02865
02866 return str;
02867 }
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
/*
 * Iterates from beg up to end, yielding each intermediate string, and
 * returns beg.
 *
 * Arguments (parsed with the "11" spec): the end string and an optional
 * flag; a truthy flag excludes end from the iteration.
 * NOTE(review): RETURN_ENUMERATOR presumably returns an enumerator when no
 * block is given -- confirm against its definition.
 *
 * Three strategies are used, in this order:
 *   1. both strings are one ASCII byte long: iterate over byte values;
 *   2. both strings are ASCII and consist only of digits: iterate
 *      numerically, zero-padding to the byte length of beg;
 *   3. otherwise: repeatedly call "succ" on the current string.
 */
02902 static VALUE
02903 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02904 {
02905 VALUE end, exclusive;
02906 VALUE current, after_end;
02907 ID succ;
02908 int n, excl, ascii;
02909 rb_encoding *enc;
02910
02911 rb_scan_args(argc, argv, "11", &end, &exclusive);
02912 RETURN_ENUMERATOR(beg, argc, argv);
02913 excl = RTEST(exclusive);
02914 CONST_ID(succ, "succ");
02915 StringValue(end);
02916 enc = rb_enc_check(beg, end);
02917 ascii = (is_ascii_string(beg) && is_ascii_string(end));
02918
/* strategy 1: single ASCII characters, stepped as byte values */
02919 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02920 char c = RSTRING_PTR(beg)[0];
02921 char e = RSTRING_PTR(end)[0];
02922
/* nothing to yield when beg is past end, or equal to an excluded end */
02923 if (c > e || (excl && c == e)) return beg;
02924 for (;;) {
02925 rb_yield(rb_enc_str_new(&c, 1, enc));
/* inclusive range: stop after yielding end; exclusive: stop before it */
02926 if (!excl && c == e) break;
02927 c++;
02928 if (excl && c == e) break;
02929 }
02930 return beg;
02931 }
02932
/* strategy 2: both strings start with a digit; verified below to be
 * digits only, otherwise control jumps to the generic path */
02933 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02934 char *s, *send;
02935 VALUE b, e;
02936 int width;
02937
02938 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
/* yielded numbers are padded to the byte length of beg */
02939 width = rb_long2int(send - s);
02940 while (s < send) {
02941 if (!ISDIGIT(*s)) goto no_digits;
02942 s++;
02943 }
02944 s = RSTRING_PTR(end); send = RSTRING_END(end);
02945 while (s < send) {
02946 if (!ISDIGIT(*s)) goto no_digits;
02947 s++;
02948 }
02949 b = rb_str_to_inum(beg, 10, FALSE);
02950 e = rb_str_to_inum(end, 10, FALSE);
/* both bounds fit in a Fixnum: count with C longs */
02951 if (FIXNUM_P(b) && FIXNUM_P(e)) {
02952 long bi = FIX2LONG(b);
02953 long ei = FIX2LONG(e);
02954 rb_encoding *usascii = rb_usascii_encoding();
02955
02956 while (bi <= ei) {
02957 if (excl && bi == ei) break;
02958 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02959 bi++;
02960 }
02961 }
/* otherwise compare and step through method calls on the integer objects */
02962 else {
02963 ID op = excl ? '<' : rb_intern("<=");
02964 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02965
02966 args[0] = INT2FIX(width);
02967 while (rb_funcall(b, op, 1, e)) {
02968 args[1] = b;
02969 rb_yield(rb_str_format(numberof(args), args, fmt));
02970 b = rb_funcall(b, succ, 0, 0);
02971 }
02972 }
02973 return beg;
02974 }
02975
/* strategy 3: generic iteration via "succ" */
02976 no_digits:
02977 n = rb_str_cmp(beg, end);
02978 if (n > 0 || (excl && n == 0)) return beg;
02979
02980 after_end = rb_funcall(end, succ, 0, 0);
02981 current = rb_str_dup(beg);
02982 while (!rb_str_equal(current, after_end)) {
02983 VALUE next = Qnil;
/* the successor is computed before yielding; it is left nil once an
 * inclusive iteration has reached end, which ends the loop below */
02984 if (excl || !rb_str_equal(current, end))
02985 next = rb_funcall(current, succ, 0, 0);
02986 rb_yield(current);
02987 if (NIL_P(next)) break;
02988 current = next;
02989 StringValue(current);
02990 if (excl && rb_str_equal(current, end)) break;
/* give up once the current string is longer than end, or empty */
02991 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02992 break;
02993 }
02994
02995 return beg;
02996 }
02997
02998 static VALUE
02999 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03000 {
03001 if (rb_reg_search(re, str, 0, 0) >= 0) {
03002 VALUE match = rb_backref_get();
03003 int nth = rb_reg_backref_number(match, backref);
03004 return rb_reg_nth_match(nth, match);
03005 }
03006 return Qnil;
03007 }
03008
03009 static VALUE
03010 rb_str_aref(VALUE str, VALUE indx)
03011 {
03012 long idx;
03013
03014 switch (TYPE(indx)) {
03015 case T_FIXNUM:
03016 idx = FIX2LONG(indx);
03017
03018 num_index:
03019 str = rb_str_substr(str, idx, 1);
03020 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03021 return str;
03022
03023 case T_REGEXP:
03024 return rb_str_subpat(str, indx, INT2FIX(0));
03025
03026 case T_STRING:
03027 if (rb_str_index(str, indx, 0) != -1)
03028 return rb_str_dup(indx);
03029 return Qnil;
03030
03031 default:
03032
03033 {
03034 long beg, len;
03035 VALUE tmp;
03036
03037 len = str_strlen(str, STR_ENC_GET(str));
03038 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03039 case Qfalse:
03040 break;
03041 case Qnil:
03042 return Qnil;
03043 default:
03044 tmp = rb_str_substr(str, beg, len);
03045 return tmp;
03046 }
03047 }
03048 idx = NUM2LONG(indx);
03049 goto num_index;
03050 }
03051 return Qnil;
03052 }
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070
03071
03072
03073
03074
03075
03076
03077
03078
03079
03080
03081
03082
03083
03084
03085
03086
03087
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103 static VALUE
03104 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03105 {
03106 if (argc == 2) {
03107 if (TYPE(argv[0]) == T_REGEXP) {
03108 return rb_str_subpat(str, argv[0], argv[1]);
03109 }
03110 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03111 }
03112 if (argc != 1) {
03113 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03114 }
03115 return rb_str_aref(str, argv[0]);
03116 }
03117
03118 VALUE
03119 rb_str_drop_bytes(VALUE str, long len)
03120 {
03121 char *ptr = RSTRING_PTR(str);
03122 long olen = RSTRING_LEN(str), nlen;
03123
03124 str_modifiable(str);
03125 if (len > olen) len = olen;
03126 nlen = olen - len;
03127 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03128 char *oldptr = ptr;
03129 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03130 STR_SET_EMBED(str);
03131 STR_SET_EMBED_LEN(str, nlen);
03132 ptr = RSTRING(str)->as.ary;
03133 memmove(ptr, oldptr + len, nlen);
03134 if (fl == STR_NOEMBED) xfree(oldptr);
03135 }
03136 else {
03137 if (!STR_SHARED_P(str)) rb_str_new4(str);
03138 ptr = RSTRING(str)->as.heap.ptr += len;
03139 RSTRING(str)->as.heap.len = nlen;
03140 }
03141 ptr[nlen] = 0;
03142 ENC_CODERANGE_CLEAR(str);
03143 return str;
03144 }
03145
03146 static void
03147 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03148 {
03149 if (beg == 0 && RSTRING_LEN(val) == 0) {
03150 rb_str_drop_bytes(str, len);
03151 OBJ_INFECT(str, val);
03152 return;
03153 }
03154
03155 rb_str_modify(str);
03156 if (len < RSTRING_LEN(val)) {
03157
03158 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03159 }
03160
03161 if (RSTRING_LEN(val) != len) {
03162 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03163 RSTRING_PTR(str) + beg + len,
03164 RSTRING_LEN(str) - (beg + len));
03165 }
03166 if (RSTRING_LEN(val) < beg && len < 0) {
03167 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03168 }
03169 if (RSTRING_LEN(val) > 0) {
03170 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03171 }
03172 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03173 if (RSTRING_PTR(str)) {
03174 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03175 }
03176 OBJ_INFECT(str, val);
03177 }
03178
03179 static void
03180 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03181 {
03182 long slen;
03183 char *p, *e;
03184 rb_encoding *enc;
03185 int singlebyte = single_byte_optimizable(str);
03186 int cr;
03187
03188 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03189
03190 StringValue(val);
03191 enc = rb_enc_check(str, val);
03192 slen = str_strlen(str, enc);
03193
03194 if (slen < beg) {
03195 out_of_range:
03196 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03197 }
03198 if (beg < 0) {
03199 if (-beg > slen) {
03200 goto out_of_range;
03201 }
03202 beg += slen;
03203 }
03204 if (slen < len || slen < beg + len) {
03205 len = slen - beg;
03206 }
03207 str_modify_keep_cr(str);
03208 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03209 if (!p) p = RSTRING_END(str);
03210 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03211 if (!e) e = RSTRING_END(str);
03212
03213 beg = p - RSTRING_PTR(str);
03214 len = e - p;
03215 rb_str_splice_0(str, beg, len, val);
03216 rb_enc_associate(str, enc);
03217 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03218 if (cr != ENC_CODERANGE_BROKEN)
03219 ENC_CODERANGE_SET(str, cr);
03220 }
03221
03222 void
03223 rb_str_update(VALUE str, long beg, long len, VALUE val)
03224 {
03225 rb_str_splice(str, beg, len, val);
03226 }
03227
03228 static void
03229 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03230 {
03231 int nth;
03232 VALUE match;
03233 long start, end, len;
03234 rb_encoding *enc;
03235 struct re_registers *regs;
03236
03237 if (rb_reg_search(re, str, 0, 0) < 0) {
03238 rb_raise(rb_eIndexError, "regexp not matched");
03239 }
03240 match = rb_backref_get();
03241 nth = rb_reg_backref_number(match, backref);
03242 regs = RMATCH_REGS(match);
03243 if (nth >= regs->num_regs) {
03244 out_of_range:
03245 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03246 }
03247 if (nth < 0) {
03248 if (-nth >= regs->num_regs) {
03249 goto out_of_range;
03250 }
03251 nth += regs->num_regs;
03252 }
03253
03254 start = BEG(nth);
03255 if (start == -1) {
03256 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03257 }
03258 end = END(nth);
03259 len = end - start;
03260 StringValue(val);
03261 enc = rb_enc_check(str, val);
03262 rb_str_splice_0(str, start, len, val);
03263 rb_enc_associate(str, enc);
03264 }
03265
03266 static VALUE
03267 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03268 {
03269 long idx, beg;
03270
03271 switch (TYPE(indx)) {
03272 case T_FIXNUM:
03273 idx = FIX2LONG(indx);
03274 num_index:
03275 rb_str_splice(str, idx, 1, val);
03276 return val;
03277
03278 case T_REGEXP:
03279 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03280 return val;
03281
03282 case T_STRING:
03283 beg = rb_str_index(str, indx, 0);
03284 if (beg < 0) {
03285 rb_raise(rb_eIndexError, "string not matched");
03286 }
03287 beg = rb_str_sublen(str, beg);
03288 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03289 return val;
03290
03291 default:
03292
03293 {
03294 long beg, len;
03295 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03296 rb_str_splice(str, beg, len, val);
03297 return val;
03298 }
03299 }
03300 idx = NUM2LONG(indx);
03301 goto num_index;
03302 }
03303 }
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328
03329
03330 static VALUE
03331 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03332 {
03333 if (argc == 3) {
03334 if (TYPE(argv[0]) == T_REGEXP) {
03335 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03336 }
03337 else {
03338 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03339 }
03340 return argv[2];
03341 }
03342 if (argc != 2) {
03343 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03344 }
03345 return rb_str_aset(str, argv[0], argv[1]);
03346 }
03347
03348
03349
03350
03351
03352
03353
03354
03355
03356
03357
03358
03359
03360
03361
03362
03363
03364
03365 static VALUE
03366 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03367 {
03368 long pos = NUM2LONG(idx);
03369
03370 if (pos == -1) {
03371 return rb_str_append(str, str2);
03372 }
03373 else if (pos < 0) {
03374 pos++;
03375 }
03376 rb_str_splice(str, pos, 0, str2);
03377 return str;
03378 }
03379
03380
03381
03382
03383
03384
03385
03386
03387
03388
03389
03390
03391
03392
03393
03394
03395
03396
03397
03398
03399
03400 static VALUE
03401 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03402 {
03403 VALUE result;
03404 VALUE buf[3];
03405 int i;
03406
03407 if (argc < 1 || 2 < argc) {
03408 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03409 }
03410 for (i=0; i<argc; i++) {
03411 buf[i] = argv[i];
03412 }
03413 str_modify_keep_cr(str);
03414 buf[i] = rb_str_new(0,0);
03415 result = rb_str_aref_m(argc, buf, str);
03416 if (!NIL_P(result)) {
03417 rb_str_aset_m(argc+1, buf, str);
03418 }
03419 return result;
03420 }
03421
03422 static VALUE
03423 get_pat(VALUE pat, int quote)
03424 {
03425 VALUE val;
03426
03427 switch (TYPE(pat)) {
03428 case T_REGEXP:
03429 return pat;
03430
03431 case T_STRING:
03432 break;
03433
03434 default:
03435 val = rb_check_string_type(pat);
03436 if (NIL_P(val)) {
03437 Check_Type(pat, T_REGEXP);
03438 }
03439 pat = val;
03440 }
03441
03442 if (quote) {
03443 pat = rb_reg_quote(pat);
03444 }
03445
03446 return rb_reg_regcomp(pat);
03447 }
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
/*
 * Replaces the first match of a pattern in str, in place.  Returns str when
 * a substitution was made and nil when the pattern did not match.
 *
 * Call forms:
 *   one argument plus a block - the block's value (as a string) replaces
 *                               the match;
 *   two arguments             - the second is a Hash (looked up with the
 *                               matched text) or a replacement string
 *                               (expanded with rb_reg_regsub()).
 * Any other form raises ArgumentError.
 *
 * A String pattern is quoted before compilation (get_pat() is called with
 * quote = 1).
 */
03460 static VALUE
03461 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03462 {
03463 VALUE pat, repl, hash = Qnil;
03464 int iter = 0;
03465 int tainted = 0;
03466 int untrusted = 0;
03467 long plen;
03468
03469 if (argc == 1 && rb_block_given_p()) {
03470 iter = 1;
03471 }
03472 else if (argc == 2) {
03473 repl = argv[1];
03474 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
/* not a hash: the replacement must be usable as a string */
03475 if (NIL_P(hash)) {
03476 StringValue(repl);
03477 }
03478 if (OBJ_TAINTED(repl)) tainted = 1;
03479 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03480 }
03481 else {
03482 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03483 }
03484
03485 pat = get_pat(argv[0], 1);
03486 str_modifiable(str);
03487 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03488 rb_encoding *enc;
03489 int cr = ENC_CODERANGE(str);
03490 VALUE match = rb_backref_get();
03491 struct re_registers *regs = RMATCH_REGS(match);
/* byte offsets of the whole match */
03492 long beg0 = BEG(0);
03493 long end0 = END(0);
03494 char *p, *rp;
03495 long len, rlen;
03496
03497 if (iter || !NIL_P(hash)) {
/* remember buffer and length so that changes made to str while the
 * block or hash lookup runs can be checked for afterwards */
03498 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03499
03500 if (iter) {
03501 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03502 }
03503 else {
03504 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03505 repl = rb_obj_as_string(repl);
03506 }
/* NOTE(review): presumably these raise when str was modified or frozen
 * in the meantime -- confirm against str_mod_check()/str_frozen_check() */
03507 str_mod_check(str, p, len);
03508 str_frozen_check(str);
03509 }
03510 else {
03511 repl = rb_reg_regsub(repl, str, regs, pat);
03512 }
03513 enc = rb_enc_compatible(str, repl);
/* No common encoding: the substitution is still accepted when the parts
 * of str before and after the match both scan as 7-bit; the result then
 * takes the encoding of the replacement. */
03514 if (!enc) {
03515 rb_encoding *str_enc = STR_ENC_GET(str);
03516 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03517 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03518 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03519 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03520 rb_enc_name(str_enc),
03521 rb_enc_name(STR_ENC_GET(repl)));
03522 }
03523 enc = STR_ENC_GET(repl);
03524 }
03525 rb_str_modify(str);
03526 rb_enc_associate(str, enc);
03527 if (OBJ_TAINTED(repl)) tainted = 1;
03528 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
/* Combine the code range saved from str (only when it was 7bit or valid)
 * with that of the replacement: a broken replacement, or a 7-bit
 * replacement going into a "valid" string, leaves the result unknown;
 * otherwise the replacement's range is taken. */
03529 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03530 int cr2 = ENC_CODERANGE(repl);
03531 if (cr2 == ENC_CODERANGE_BROKEN ||
03532 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03533 cr = ENC_CODERANGE_UNKNOWN;
03534 else
03535 cr = cr2;
03536 }
/* plen: bytes being replaced; rlen: bytes of the replacement */
03537 plen = end0 - beg0;
03538 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03539 len = RSTRING_LEN(str);
03540 if (rlen > plen) {
03541 RESIZE_CAPA(str, len + rlen - plen);
03542 }
/* fetch the pointer only after the possible resize */
03543 p = RSTRING_PTR(str);
03544 if (rlen != plen) {
03545 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03546 }
03547 memcpy(p + beg0, rp, rlen);
03548 len += rlen - plen;
03549 STR_SET_LEN(str, len);
03550 RSTRING_PTR(str)[len] = '\0';
03551 ENC_CODERANGE_SET(str, cr);
03552 if (tainted) OBJ_TAINT(str);
03553 if (untrusted) OBJ_UNTRUST(str);
03554
03555 return str;
03556 }
03557 return Qnil;
03558 }
03559
03560
03561
03562
03563
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599
03600
03601 static VALUE
03602 rb_str_sub(int argc, VALUE *argv, VALUE str)
03603 {
03604 str = rb_str_dup(str);
03605 rb_str_sub_bang(argc, argv, str);
03606 return str;
03607 }
03608
/*
 * Shared implementation of global substitution.  Replaces every match of
 * the pattern argv[0] in str; the result is assembled in a separate buffer
 * (dest).
 *
 *   bang != 0: the contents of str are replaced by the result and str is
 *              returned, or nil when the pattern never matched.
 *   bang == 0: the result is returned as a new string with the class and
 *              taint of str; a plain duplicate of str is returned when the
 *              pattern never matched.
 *
 * Call forms: one argument (block form) or two arguments (replacement
 * string or Hash); anything else raises ArgumentError.
 * NOTE(review): RETURN_ENUMERATOR presumably returns an enumerator when the
 * one-argument form is used without a block -- confirm.
 *
 * A String pattern is quoted before compilation (get_pat() is called with
 * quote = 1).
 */
03609 static VALUE
03610 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03611 {
03612 VALUE pat, val, repl, match, dest, hash = Qnil;
03613 struct re_registers *regs;
03614 long beg, n;
03615 long beg0, end0;
03616 long offset, blen, slen, len, last;
03617 int iter = 0;
03618 char *sp, *cp;
03619 int tainted = 0;
03620 rb_encoding *str_enc;
03621
03622 switch (argc) {
03623 case 1:
03624 RETURN_ENUMERATOR(str, argc, argv);
03625 iter = 1;
03626 break;
03627 case 2:
03628 repl = argv[1];
03629 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03630 if (NIL_P(hash)) {
03631 StringValue(repl);
03632 }
03633 if (OBJ_TAINTED(repl)) tainted = 1;
03634 break;
03635 default:
03636 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03637 }
03638
03639 pat = get_pat(argv[0], 1);
03640 beg = rb_reg_search(pat, str, 0, 0);
03641 if (beg < 0) {
03642 if (bang) return Qnil;
03643 return rb_str_dup(str);
03644 }
03645
/* offset: byte position in str up to which input has been consumed;
 * cp: pointer to that position; sp/slen: buffer and length of str as seen
 * before the loop, used for the modification check below */
03646 offset = 0;
03647 n = 0;
03648 blen = RSTRING_LEN(str) + 30;
03649 dest = rb_str_buf_new(blen);
03650 sp = RSTRING_PTR(str);
03651 slen = RSTRING_LEN(str);
03652 cp = sp;
03653 str_enc = STR_ENC_GET(str);
03654
03655 do {
03656 n++;
03657 match = rb_backref_get();
03658 regs = RMATCH_REGS(match);
03659 beg0 = BEG(0);
03660 end0 = END(0);
03661 if (iter || !NIL_P(hash)) {
03662 if (iter) {
03663 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03664 }
03665 else {
03666 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03667 val = rb_obj_as_string(val);
03668 }
/* NOTE(review): presumably raises when str was modified by the block or
 * the hash lookup -- confirm against str_mod_check() */
03669 str_mod_check(str, sp, slen);
/* the replacement must not be the work buffer itself */
03670 if (val == dest) {
03671 rb_raise(rb_eRuntimeError, "block should not cheat");
03672 }
03673 }
03674 else {
03675 val = rb_reg_regsub(repl, str, regs, pat);
03676 }
03677
03678 if (OBJ_TAINTED(val)) tainted = 1;
03679
/* copy the unmatched bytes between the consumed position and this match */
03680 len = beg - offset;
03681 if (len) {
03682 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03683 }
03684
03685 rb_str_buf_append(dest, val);
03686
03687 last = offset;
03688 offset = end0;
/* Empty match: copy one character through unchanged and continue the
 * search after it, so that the next search does not start at the same
 * position again.  Stop when the empty match was at the end of str. */
03689 if (beg0 == end0) {
03690
03691
03692
03693
03694 if (RSTRING_LEN(str) <= end0) break;
03695 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03696 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03697 offset = end0 + len;
03698 }
03699 cp = RSTRING_PTR(str) + offset;
03700 if (offset > RSTRING_LEN(str)) break;
03701 beg = rb_reg_search(pat, str, offset, 0);
03702 } while (beg >= 0);
/* copy whatever follows the last match */
03703 if (RSTRING_LEN(str) > offset) {
03704 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03705 }
/* NOTE(review): the search is repeated from the start position of the
 * last successful round, presumably to leave the match data pointing at
 * the last match instead of the final failed search -- confirm */
03706 rb_reg_search(pat, str, last, 0);
03707 if (bang) {
03708 rb_str_shared_replace(str, dest);
03709 }
03710 else {
03711 RBASIC(dest)->klass = rb_obj_class(str);
03712 OBJ_INFECT(dest, str);
03713 str = dest;
03714 }
03715
03716 if (tainted) OBJ_TAINT(str);
03717 return str;
03718 }
03719
03720
03721
03722
03723
03724
03725
03726
03727
03728
03729
03730
03731
03732 static VALUE
03733 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03734 {
03735 str_modify_keep_cr(str);
03736 return str_gsub(argc, argv, str, 1);
03737 }
03738
03739
03740
03741
03742
03743
03744
03745
03746
03747
03748
03749
03750
03751
03752
03753
03754
03755
03756
03757
03758
03759
03760
03761
03762
03763
03764
03765
03766
03767
03768
03769
03770
03771
03772
03773
03774
03775
03776
03777
03778
03779
03780
03781
03782
03783 static VALUE
03784 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03785 {
03786 return str_gsub(argc, argv, str, 0);
03787 }
03788
03789
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801 VALUE
03802 rb_str_replace(VALUE str, VALUE str2)
03803 {
03804 str_modifiable(str);
03805 if (str == str2) return str;
03806
03807 StringValue(str2);
03808 str_discard(str);
03809 return str_replace(str, str2);
03810 }
03811
03812
03813
03814
03815
03816
03817
03818
03819
03820
03821
03822 static VALUE
03823 rb_str_clear(VALUE str)
03824 {
03825 str_discard(str);
03826 STR_SET_EMBED(str);
03827 STR_SET_EMBED_LEN(str, 0);
03828 RSTRING_PTR(str)[0] = 0;
03829 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03830 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03831 else
03832 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03833 return str;
03834 }
03835
03836
03837
03838
03839
03840
03841
03842
03843
03844
03845
03846 static VALUE
03847 rb_str_chr(VALUE str)
03848 {
03849 return rb_str_substr(str, 0, 1);
03850 }
03851
03852
03853
03854
03855
03856
03857
03858 static VALUE
03859 rb_str_getbyte(VALUE str, VALUE index)
03860 {
03861 long pos = NUM2LONG(index);
03862
03863 if (pos < 0)
03864 pos += RSTRING_LEN(str);
03865 if (pos < 0 || RSTRING_LEN(str) <= pos)
03866 return Qnil;
03867
03868 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03869 }
03870
03871
03872
03873
03874
03875
03876
03877 static VALUE
03878 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03879 {
03880 long pos = NUM2LONG(index);
03881 int byte = NUM2INT(value);
03882
03883 rb_str_modify(str);
03884
03885 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03886 rb_raise(rb_eIndexError, "index %ld out of string", pos);
03887 if (pos < 0)
03888 pos += RSTRING_LEN(str);
03889
03890 RSTRING_PTR(str)[pos] = byte;
03891
03892 return value;
03893 }
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903
03904 static VALUE
03905 rb_str_reverse(VALUE str)
03906 {
03907 rb_encoding *enc;
03908 VALUE rev;
03909 char *s, *e, *p;
03910 int single = 1;
03911
03912 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03913 enc = STR_ENC_GET(str);
03914 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03915 s = RSTRING_PTR(str); e = RSTRING_END(str);
03916 p = RSTRING_END(rev);
03917
03918 if (RSTRING_LEN(str) > 1) {
03919 if (single_byte_optimizable(str)) {
03920 while (s < e) {
03921 *--p = *s++;
03922 }
03923 }
03924 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03925 while (s < e) {
03926 int clen = rb_enc_fast_mbclen(s, e, enc);
03927
03928 if (clen > 1 || (*s & 0x80)) single = 0;
03929 p -= clen;
03930 memcpy(p, s, clen);
03931 s += clen;
03932 }
03933 }
03934 else {
03935 while (s < e) {
03936 int clen = rb_enc_mbclen(s, e, enc);
03937
03938 if (clen > 1 || (*s & 0x80)) single = 0;
03939 p -= clen;
03940 memcpy(p, s, clen);
03941 s += clen;
03942 }
03943 }
03944 }
03945 STR_SET_LEN(rev, RSTRING_LEN(str));
03946 OBJ_INFECT(rev, str);
03947 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03948 if (single) {
03949 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03950 }
03951 else {
03952 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03953 }
03954 }
03955 rb_enc_cr_str_copy_for_substr(rev, str);
03956
03957 return rev;
03958 }
03959
03960
03961
03962
03963
03964
03965
03966
03967
03968 static VALUE
03969 rb_str_reverse_bang(VALUE str)
03970 {
03971 if (RSTRING_LEN(str) > 1) {
03972 if (single_byte_optimizable(str)) {
03973 char *s, *e, c;
03974
03975 str_modify_keep_cr(str);
03976 s = RSTRING_PTR(str);
03977 e = RSTRING_END(str) - 1;
03978 while (s < e) {
03979 c = *s;
03980 *s++ = *e;
03981 *e-- = c;
03982 }
03983 }
03984 else {
03985 rb_str_shared_replace(str, rb_str_reverse(str));
03986 }
03987 }
03988 else {
03989 str_modify_keep_cr(str);
03990 }
03991 return str;
03992 }
03993
03994
03995
03996
03997
03998
03999
04000
04001
04002
04003
04004
04005
04006
04007 static VALUE
04008 rb_str_include(VALUE str, VALUE arg)
04009 {
04010 long i;
04011
04012 StringValue(arg);
04013 i = rb_str_index(str, arg, 0);
04014
04015 if (i == -1) return Qfalse;
04016 return Qtrue;
04017 }
04018
04019
04020
04021
04022
04023
04024
04025
04026
04027
04028
04029
04030
04031
04032
04033
04034
04035
04036
04037
04038
04039
04040
04041 static VALUE
04042 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04043 {
04044 int base;
04045
04046 if (argc == 0) base = 10;
04047 else {
04048 VALUE b;
04049
04050 rb_scan_args(argc, argv, "01", &b);
04051 base = NUM2INT(b);
04052 }
04053 if (base < 0) {
04054 rb_raise(rb_eArgError, "invalid radix %d", base);
04055 }
04056 return rb_str_to_inum(str, base, FALSE);
04057 }
04058
04059
04060
04061
04062
04063
04064
04065
04066
04067
04068
04069
04070
04071
04072
04073
04074 static VALUE
04075 rb_str_to_f(VALUE str)
04076 {
04077 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04078 }
04079
04080
04081
04082
04083
04084
04085
04086
04087
04088
04089 static VALUE
04090 rb_str_to_s(VALUE str)
04091 {
04092 if (rb_obj_class(str) != rb_cString) {
04093 return str_duplicate(rb_cString, str);
04094 }
04095 return str;
04096 }
04097
#if 0
/*
 * Compiled out.  Encodes code point c in enc and appends the resulting
 * bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char mbc[RUBY_MAX_CHAR_LEN];
    int mbc_len = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, mbc, enc);
    rb_enc_str_buf_cat(str, mbc, mbc_len, enc);
}
#endif
04109
04110 #define CHAR_ESC_LEN 13
04111
04112 int
04113 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04114 {
04115 char buf[CHAR_ESC_LEN + 1];
04116 int l;
04117
04118 #if SIZEOF_INT > 4
04119 c &= 0xffffffff;
04120 #endif
04121 if (unicode_p) {
04122 if (c < 0x7F && ISPRINT(c)) {
04123 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04124 }
04125 else if (c < 0x10000) {
04126 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04127 }
04128 else {
04129 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04130 }
04131 }
04132 else {
04133 if (c < 0x100) {
04134 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04135 }
04136 else {
04137 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04138 }
04139 }
04140 l = (int)strlen(buf);
04141 rb_str_buf_cat(result, buf, l);
04142 return l;
04143 }
04144
04145
04146
04147
04148
04149
04150
04151
04152
04153
04154
04155
04156
/*
 * Returns a double-quoted, escaped rendering of str as a new string.
 *
 * The result is tagged with the default internal encoding, or the default
 * external encoding when no internal one is set, or US-ASCII when that
 * choice is not ASCII-compatible.
 *
 * The input is walked character by character.  Characters that need no
 * treatment are not copied one at a time: prev marks the start of the
 * pending run of input bytes, which is flushed to result whenever an
 * escape has to be emitted and once more at the end.
 */
04157 VALUE
04158 rb_str_inspect(VALUE str)
04159 {
04160 rb_encoding *enc = STR_ENC_GET(str);
04161 const char *p, *pend, *prev;
04162 char buf[CHAR_ESC_LEN + 1];
04163 VALUE result = rb_str_buf_new(0);
04164 rb_encoding *resenc = rb_default_internal_encoding();
04165 int unicode_p = rb_enc_unicode_p(enc);
04166 int asciicompat = rb_enc_asciicompat(enc);
04167
04168 if (resenc == NULL) resenc = rb_default_external_encoding();
04169 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04170 rb_enc_associate(result, resenc);
04171 str_buf_cat2(result, "\"");
04172
04173 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04174 prev = p;
04175 while (p < pend) {
04176 unsigned int c, cc;
04177 int n;
04178
04179 n = rb_enc_precise_mbclen(p, pend, enc);
/* no complete character here: flush the pending run and emit the next
 * bytes (the encoding's minimum character length, clipped to what is
 * left) as \xNN escapes */
04180 if (!MBCLEN_CHARFOUND_P(n)) {
04181 if (p > prev) str_buf_cat(result, prev, p - prev);
04182 n = rb_enc_mbminlen(enc);
04183 if (pend < p + n)
04184 n = (int)(pend - p);
04185 while (n--) {
04186 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04187 str_buf_cat(result, buf, strlen(buf));
04188 prev = ++p;
04189 }
04190 continue;
04191 }
04192 n = MBCLEN_CHARFOUND_LEN(n);
04193 c = rb_enc_mbc_to_codepoint(p, pend, enc);
/* p now points past the current character; p - n is its start */
04194 p += n;
/* a double quote, a backslash, and a '#' that is followed by '$', '@' or
 * '{' get a backslash in front */
04195 if (c == '"'|| c == '\\' ||
04196 (c == '#' &&
04197 p < pend &&
04198 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04199 (cc = rb_enc_codepoint(p,pend,enc),
04200 (cc == '$' || cc == '@' || cc == '{')))) {
04201 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04202 str_buf_cat2(result, "\\");
/* the character itself starts the next pending run and is copied as it
 * is; otherwise control falls through to the checks below */
04203 if (asciicompat || enc == resenc) {
04204 prev = p - n;
04205 continue;
04206 }
04207 }
/* control characters that have a short backslash escape */
04208 switch (c) {
04209 case '\n': cc = 'n'; break;
04210 case '\r': cc = 'r'; break;
04211 case '\t': cc = 't'; break;
04212 case '\f': cc = 'f'; break;
04213 case '\013': cc = 'v'; break;
04214 case '\010': cc = 'b'; break;
04215 case '\007': cc = 'a'; break;
04216 case 033: cc = 'e'; break;
04217 default: cc = 0; break;
04218 }
04219 if (cc) {
04220 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04221 buf[0] = '\\';
04222 buf[1] = (char)cc;
04223 str_buf_cat(result, buf, 2);
04224 prev = p;
04225 continue;
04226 }
/* Printable characters stay in the pending run: any printable character
 * when source and result encodings are the same, otherwise only printable
 * ASCII of an ASCII-compatible source.  Everything else is escaped
 * numerically. */
04227 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04228 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04229 continue;
04230 }
04231 else {
04232 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04233 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04234 prev = p;
04235 continue;
04236 }
04237 }
/* flush what is left of the pending run and close the quote */
04238 if (p > prev) str_buf_cat(result, prev, p - prev);
04239 str_buf_cat2(result, "\"");
04240
04241 OBJ_INFECT(result, str);
04242 return result;
04243 }
04244
04245 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04246
04247
04248
04249
04250
04251
04252
04253
04254
04255 VALUE
04256 rb_str_dump(VALUE str)
04257 {
04258 rb_encoding *enc = rb_enc_get(str);
04259 long len;
04260 const char *p, *pend;
04261 char *q, *qend;
04262 VALUE result;
04263 int u8 = (enc == rb_utf8_encoding());
04264
04265 len = 2;
04266 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04267 while (p < pend) {
04268 unsigned char c = *p++;
04269 switch (c) {
04270 case '"': case '\\':
04271 case '\n': case '\r':
04272 case '\t': case '\f':
04273 case '\013': case '\010': case '\007': case '\033':
04274 len += 2;
04275 break;
04276
04277 case '#':
04278 len += IS_EVSTR(p, pend) ? 2 : 1;
04279 break;
04280
04281 default:
04282 if (ISPRINT(c)) {
04283 len++;
04284 }
04285 else {
04286 if (u8) {
04287 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04288 if (MBCLEN_CHARFOUND_P(n-1)) {
04289 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04290 while (cc >>= 4) len++;
04291 len += 5;
04292 p += MBCLEN_CHARFOUND_LEN(n)-1;
04293 break;
04294 }
04295 }
04296 len += 4;
04297 }
04298 break;
04299 }
04300 }
04301 if (!rb_enc_asciicompat(enc)) {
04302 len += 19;
04303 len += strlen(enc->name);
04304 }
04305
04306 result = rb_str_new5(str, 0, len);
04307 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04308 q = RSTRING_PTR(result); qend = q + len + 1;
04309
04310 *q++ = '"';
04311 while (p < pend) {
04312 unsigned char c = *p++;
04313
04314 if (c == '"' || c == '\\') {
04315 *q++ = '\\';
04316 *q++ = c;
04317 }
04318 else if (c == '#') {
04319 if (IS_EVSTR(p, pend)) *q++ = '\\';
04320 *q++ = '#';
04321 }
04322 else if (c == '\n') {
04323 *q++ = '\\';
04324 *q++ = 'n';
04325 }
04326 else if (c == '\r') {
04327 *q++ = '\\';
04328 *q++ = 'r';
04329 }
04330 else if (c == '\t') {
04331 *q++ = '\\';
04332 *q++ = 't';
04333 }
04334 else if (c == '\f') {
04335 *q++ = '\\';
04336 *q++ = 'f';
04337 }
04338 else if (c == '\013') {
04339 *q++ = '\\';
04340 *q++ = 'v';
04341 }
04342 else if (c == '\010') {
04343 *q++ = '\\';
04344 *q++ = 'b';
04345 }
04346 else if (c == '\007') {
04347 *q++ = '\\';
04348 *q++ = 'a';
04349 }
04350 else if (c == '\033') {
04351 *q++ = '\\';
04352 *q++ = 'e';
04353 }
04354 else if (ISPRINT(c)) {
04355 *q++ = c;
04356 }
04357 else {
04358 *q++ = '\\';
04359 if (u8) {
04360 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04361 if (MBCLEN_CHARFOUND_P(n)) {
04362 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04363 p += n;
04364 snprintf(q, qend-q, "u{%x}", cc);
04365 q += strlen(q);
04366 continue;
04367 }
04368 }
04369 snprintf(q, qend-q, "x%02X", c);
04370 q += 3;
04371 }
04372 }
04373 *q++ = '"';
04374 *q = '\0';
04375 if (!rb_enc_asciicompat(enc)) {
04376 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04377 enc = rb_ascii8bit_encoding();
04378 }
04379 OBJ_INFECT(result, str);
04380
04381 rb_enc_associate(result, enc);
04382 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04383 return result;
04384 }
04385
04386
04387 static void
04388 rb_str_check_dummy_enc(rb_encoding *enc)
04389 {
04390 if (rb_enc_dummy_p(enc)) {
04391 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04392 rb_enc_name(enc));
04393 }
04394 }
04395
04396
04397
04398
04399
04400
04401
04402
04403
04404
04405 static VALUE
04406 rb_str_upcase_bang(VALUE str)
04407 {
04408 rb_encoding *enc;
04409 char *s, *send;
04410 int modify = 0;
04411 int n;
04412
04413 str_modify_keep_cr(str);
04414 enc = STR_ENC_GET(str);
04415 rb_str_check_dummy_enc(enc);
04416 s = RSTRING_PTR(str); send = RSTRING_END(str);
04417 if (single_byte_optimizable(str)) {
04418 while (s < send) {
04419 unsigned int c = *(unsigned char*)s;
04420
04421 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04422 *s = 'A' + (c - 'a');
04423 modify = 1;
04424 }
04425 s++;
04426 }
04427 }
04428 else {
04429 int ascompat = rb_enc_asciicompat(enc);
04430
04431 while (s < send) {
04432 unsigned int c;
04433
04434 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04435 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04436 *s = 'A' + (c - 'a');
04437 modify = 1;
04438 }
04439 s++;
04440 }
04441 else {
04442 c = rb_enc_codepoint_len(s, send, &n, enc);
04443 if (rb_enc_islower(c, enc)) {
04444
04445 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04446 modify = 1;
04447 }
04448 s += n;
04449 }
04450 }
04451 }
04452
04453 if (modify) return str;
04454 return Qnil;
04455 }
04456
04457
04458
04459
04460
04461
04462
04463
04464
04465
04466
04467
04468
04469
04470 static VALUE
04471 rb_str_upcase(VALUE str)
04472 {
04473 str = rb_str_dup(str);
04474 rb_str_upcase_bang(str);
04475 return str;
04476 }
04477
04478
04479
04480
04481
04482
04483
04484
04485
04486
04487
04488 static VALUE
04489 rb_str_downcase_bang(VALUE str)
04490 {
04491 rb_encoding *enc;
04492 char *s, *send;
04493 int modify = 0;
04494
04495 str_modify_keep_cr(str);
04496 enc = STR_ENC_GET(str);
04497 rb_str_check_dummy_enc(enc);
04498 s = RSTRING_PTR(str); send = RSTRING_END(str);
04499 if (single_byte_optimizable(str)) {
04500 while (s < send) {
04501 unsigned int c = *(unsigned char*)s;
04502
04503 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04504 *s = 'a' + (c - 'A');
04505 modify = 1;
04506 }
04507 s++;
04508 }
04509 }
04510 else {
04511 int ascompat = rb_enc_asciicompat(enc);
04512
04513 while (s < send) {
04514 unsigned int c;
04515 int n;
04516
04517 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04518 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04519 *s = 'a' + (c - 'A');
04520 modify = 1;
04521 }
04522 s++;
04523 }
04524 else {
04525 c = rb_enc_codepoint_len(s, send, &n, enc);
04526 if (rb_enc_isupper(c, enc)) {
04527
04528 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04529 modify = 1;
04530 }
04531 s += n;
04532 }
04533 }
04534 }
04535
04536 if (modify) return str;
04537 return Qnil;
04538 }
04539
04540
04541
04542
04543
04544
04545
04546
04547
04548
04549
04550
04551
04552
04553 static VALUE
04554 rb_str_downcase(VALUE str)
04555 {
04556 str = rb_str_dup(str);
04557 rb_str_downcase_bang(str);
04558 return str;
04559 }
04560
04561
04562
04563
04564
04565
04566
04567
04568
04569
04570
04571
04572
04573
04574
04575
04576 static VALUE
04577 rb_str_capitalize_bang(VALUE str)
04578 {
04579 rb_encoding *enc;
04580 char *s, *send;
04581 int modify = 0;
04582 unsigned int c;
04583 int n;
04584
04585 str_modify_keep_cr(str);
04586 enc = STR_ENC_GET(str);
04587 rb_str_check_dummy_enc(enc);
04588 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04589 s = RSTRING_PTR(str); send = RSTRING_END(str);
04590
04591 c = rb_enc_codepoint_len(s, send, &n, enc);
04592 if (rb_enc_islower(c, enc)) {
04593 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04594 modify = 1;
04595 }
04596 s += n;
04597 while (s < send) {
04598 c = rb_enc_codepoint_len(s, send, &n, enc);
04599 if (rb_enc_isupper(c, enc)) {
04600 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04601 modify = 1;
04602 }
04603 s += n;
04604 }
04605
04606 if (modify) return str;
04607 return Qnil;
04608 }
04609
04610
04611
04612
04613
04614
04615
04616
04617
04618
04619
04620
04621
04622
04623
04624 static VALUE
04625 rb_str_capitalize(VALUE str)
04626 {
04627 str = rb_str_dup(str);
04628 rb_str_capitalize_bang(str);
04629 return str;
04630 }
04631
04632
04633
04634
04635
04636
04637
04638
04639
04640
04641
04642 static VALUE
04643 rb_str_swapcase_bang(VALUE str)
04644 {
04645 rb_encoding *enc;
04646 char *s, *send;
04647 int modify = 0;
04648 int n;
04649
04650 str_modify_keep_cr(str);
04651 enc = STR_ENC_GET(str);
04652 rb_str_check_dummy_enc(enc);
04653 s = RSTRING_PTR(str); send = RSTRING_END(str);
04654 while (s < send) {
04655 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04656
04657 if (rb_enc_isupper(c, enc)) {
04658
04659 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04660 modify = 1;
04661 }
04662 else if (rb_enc_islower(c, enc)) {
04663
04664 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04665 modify = 1;
04666 }
04667 s += n;
04668 }
04669
04670 if (modify) return str;
04671 return Qnil;
04672 }
04673
04674
04675
04676
04677
04678
04679
04680
04681
04682
04683
04684
04685
04686
04687 static VALUE
04688 rb_str_swapcase(VALUE str)
04689 {
04690 str = rb_str_dup(str);
04691 rb_str_swapcase_bang(str);
04692 return str;
04693 }
04694
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (the argument strings
 * of tr, delete, squeeze and count).  Advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while an "a-z" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
04702
04703 static unsigned int
04704 trnext(struct tr *t, rb_encoding *enc)
04705 {
04706 int n;
04707
04708 for (;;) {
04709 if (!t->gen) {
04710 if (t->p == t->pend) return -1;
04711 if (t->p < t->pend - 1 && *t->p == '\\') {
04712 t->p++;
04713 }
04714 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04715 t->p += n;
04716 if (t->p < t->pend - 1 && *t->p == '-') {
04717 t->p++;
04718 if (t->p < t->pend) {
04719 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04720 t->p += n;
04721 if (t->now > c) {
04722 if (t->now < 0x80 && c < 0x80) {
04723 rb_raise(rb_eArgError,
04724 "invalid range \"%c-%c\" in string transliteration",
04725 t->now, c);
04726 }
04727 else {
04728 rb_raise(rb_eArgError, "invalid range in string transliteration");
04729 }
04730 continue;
04731 }
04732 t->gen = 1;
04733 t->max = c;
04734 }
04735 }
04736 return t->now;
04737 }
04738 else if (++t->now < t->max) {
04739 return t->now;
04740 }
04741 else {
04742 t->gen = 0;
04743 return t->max;
04744 }
04745 }
04746 }
04747
static VALUE rb_str_delete_bang(int,VALUE*,VALUE);

/*
 * Shared implementation of tr!/tr_s!: rewrites +str+ in place, replacing
 * each character listed in +src+ with the character at the corresponding
 * position of +repl+ (the last character of +repl+ is reused when +repl+
 * is shorter).  A leading '^' in +src+ (when more characters follow)
 * negates the set: every character NOT listed is replaced by the last
 * character of +repl+.
 *
 * sflag  non-zero selects the "squeeze" variant, in which consecutive
 *        characters that translate to the same replacement are emitted
 *        only once.
 *
 * Returns +str+ when it was modified, Qnil otherwise.  An empty +str+
 * returns Qnil immediately; an empty +repl+ delegates to
 * rb_str_delete_bang() with +src+ as the only argument.
 *
 * Mapping storage: code points below 256 are looked up in trans[];
 * larger code points are kept in a lazily created Ruby hash.
 */
static VALUE
tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
{
    const unsigned int errc = -1;       /* "no mapping" / end-of-spec marker */
    unsigned int trans[256];
    rb_encoding *enc, *e1, *e2;
    struct tr trsrc, trrepl;
    int cflag = 0;                      /* set when src starts with '^' */
    unsigned int c, c0;
    int last = 0, modify = 0, i, l;
    char *s, *send;
    VALUE hash = 0;
    int singlebyte = single_byte_optimizable(str);
    int cr;

/* Promotes the tracked code range from 7BIT to VALID once a non-ASCII
 * code point is seen. */
#define CHECK_IF_ASCII(c) \
    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
           (cr = ENC_CODERANGE_VALID) : 0)

    StringValue(src);
    StringValue(repl);
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    if (RSTRING_LEN(repl) == 0) {
        return rb_str_delete_bang(1, &src, str);
    }

    cr = ENC_CODERANGE(str);
    /* e1 is the encoding used to decode str below; enc is the one used
     * for the specs and for writing the output. */
    e1 = rb_enc_check(str, src);
    e2 = rb_enc_check(str, repl);
    if (e1 == e2) {
        enc = e1;
    }
    else {
        enc = rb_enc_check(src, repl);
    }
    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    /* '^' negates only when something follows it */
    if (RSTRING_LEN(src) > 1 &&
        rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
        trsrc.p + l < trsrc.pend) {
        cflag = 1;
        trsrc.p += l;
    }
    trrepl.p = RSTRING_PTR(repl);
    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
    trsrc.gen = trrepl.gen = 0;
    trsrc.now = trrepl.now = 0;
    trsrc.max = trrepl.max = 0;

    if (cflag) {
        /* Negated set: start with "translate everything", then punch
         * holes (errc) for the characters that are listed in src. */
        for (i=0; i<256; i++) {
            trans[i] = 1;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            if (c < 256) {
                trans[c] = errc;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), Qtrue);
            }
        }
        /* run repl to its end; only its last character is used */
        while ((c = trnext(&trrepl, enc)) != errc)
            ;
        last = trrepl.now;
        for (i=0; i<256; i++) {
            if (trans[i] != errc) {
                trans[i] = last;
            }
        }
    }
    else {
        unsigned int r;

        for (i=0; i<256; i++) {
            trans[i] = errc;
        }
        /* pair src characters with repl characters position by position */
        while ((c = trnext(&trsrc, enc)) != errc) {
            r = trnext(&trrepl, enc);
            /* repl exhausted: keep reusing its last character */
            if (r == errc) r = trrepl.now;
            if (c < 256) {
                trans[c] = r;
                /* a multibyte replacement rules out the byte-wise path */
                if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
            }
        }
    }

    /* restart code range tracking at 7BIT; CHECK_IF_ASCII raises it again */
    if (cr == ENC_CODERANGE_VALID)
        cr = ENC_CODERANGE_7BIT;
    str_modify_keep_cr(str);
    s = RSTRING_PTR(str); send = RSTRING_END(str);
    if (sflag) {
        /* Squeeze variant: build the result in a fresh buffer. */
        int clen, tlen;
        long offset, max = RSTRING_LEN(str);
        unsigned int save = -1;         /* previous replacement emitted */
        char *buf = ALLOC_N(char, max), *t = buf;

        while (s < send) {
            int may_modify = 0;

            c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            s += clen;
            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = errc;
            }
            if (c != (unsigned int)-1) {
                /* same replacement as the previous one: drop it */
                if (save == c) {
                    CHECK_IF_ASCII(c);
                    continue;
                }
                save = c;
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated character: copy the original through */
                save = -1;
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* grow the buffer; ">=" keeps room for the final NUL */
            while (t - buf + tlen >= max) {
                offset = t - buf;
                max *= 2;
                REALLOC_N(buf, char, max);
                t = buf + offset;
            }
            rb_enc_mbcput(c, t, enc);
            /* NOTE(review): s has already been advanced past this
             * character at this point, so this compares against the
             * bytes that follow it -- confirm this is intended. */
            if (may_modify && memcmp(s, t, tlen) != 0) {
                modify = 1;
            }
            CHECK_IF_ASCII(c);
            t += tlen;
        }
        /* hand the new buffer over to str */
        if (!STR_EMBED_P(str)) {
            xfree(RSTRING(str)->as.heap.ptr);
        }
        *t = '\0';
        RSTRING(str)->as.heap.ptr = buf;
        RSTRING(str)->as.heap.len = t - buf;
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }
    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
        /* Byte-wise path: every character is one byte before and after,
         * so the string can be patched in place. */
        while (s < send) {
            c = (unsigned char)*s;
            if (trans[c] != errc) {
                if (!cflag) {
                    c = trans[c];
                    *s = c;
                    modify = 1;
                }
                else {
                    *s = last;
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s++;
        }
    }
    else {
        /* General multibyte path: replacement widths may differ, so the
         * result is built in a fresh buffer. */
        int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
        long offset;
        char *buf = ALLOC_N(char, max), *t = buf;

        while (s < send) {
            int may_modify = 0;
            c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = errc;
            }
            if (c != errc) {
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated character: copy the original through */
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* grow the buffer; ">=" keeps room for the final NUL */
            while (t - buf + tlen >= max) {
                offset = t - buf;
                max *= 2;
                REALLOC_N(buf, char, max);
                t = buf + offset;
            }
            if (s != t) {
                rb_enc_mbcput(c, t, enc);
                if (may_modify && memcmp(s, t, tlen) != 0) {
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s += clen;
            t += tlen;
        }
        /* hand the new buffer over to str */
        if (!STR_EMBED_P(str)) {
            xfree(RSTRING(str)->as.heap.ptr);
        }
        *t = '\0';
        RSTRING(str)->as.heap.ptr = buf;
        RSTRING(str)->as.heap.len = t - buf;
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }

    if (modify) {
        if (cr != ENC_CODERANGE_BROKEN)
            ENC_CODERANGE_SET(str, cr);
        rb_enc_associate(str, enc);
        return str;
    }
    return Qnil;
}
04993
04994
04995
04996
04997
04998
04999
05000
05001
05002
05003
05004 static VALUE
05005 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05006 {
05007 return tr_trans(str, src, repl, 0);
05008 }
05009
05010
05011
05012
05013
05014
05015
05016
05017
05018
05019
05020
05021
05022
05023
05024
05025
05026
05027
05028 static VALUE
05029 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05030 {
05031 str = rb_str_dup(str);
05032 tr_trans(str, src, repl, 0);
05033 return str;
05034 }
05035
05036 static void
05037 tr_setup_table(VALUE str, char stable[256], int first,
05038 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05039 {
05040 const unsigned int errc = -1;
05041 char buf[256];
05042 struct tr tr;
05043 unsigned int c;
05044 VALUE table = 0, ptable = 0;
05045 int i, l, cflag = 0;
05046
05047 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05048 tr.gen = tr.now = tr.max = 0;
05049
05050 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05051 cflag = 1;
05052 tr.p += l;
05053
05054 table = rb_hash_new();
05055 ptable = *ctablep;
05056 *ctablep = table;
05057 }
05058 else {
05059 table = rb_hash_new();
05060 ptable = *tablep;
05061 *tablep = table;
05062 }
05063 if (first) {
05064 for (i=0; i<256; i++) {
05065 stable[i] = 1;
05066 }
05067 }
05068 for (i=0; i<256; i++) {
05069 buf[i] = cflag;
05070 }
05071
05072 while ((c = trnext(&tr, enc)) != errc) {
05073 if (c < 256) {
05074 buf[c & 0xff] = !cflag;
05075 }
05076 else {
05077 VALUE key = UINT2NUM(c);
05078
05079 if (!table) {
05080 table = rb_hash_new();
05081 ptable = *tablep;
05082 *tablep = table;
05083 }
05084 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05085 rb_hash_aset(table, key, Qtrue);
05086 }
05087 }
05088 }
05089 for (i=0; i<256; i++) {
05090 stable[i] = stable[i] && buf[i];
05091 }
05092 }
05093
05094
05095 static int
05096 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05097 {
05098 if (c < 256) {
05099 return table[c] != 0;
05100 }
05101 else {
05102 VALUE v = UINT2NUM(c);
05103
05104 if (del) {
05105 if (!NIL_P(rb_hash_lookup(del, v)) &&
05106 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05107 return TRUE;
05108 }
05109 }
05110 else if (nodel && NIL_P(rb_hash_lookup(nodel, v))) {
05111 return TRUE;
05112 }
05113 return FALSE;
05114 }
05115 }
05116
05117
05118
05119
05120
05121
05122
05123
05124
05125 static VALUE
05126 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05127 {
05128 char squeez[256];
05129 rb_encoding *enc = 0;
05130 char *s, *send, *t;
05131 VALUE del = 0, nodel = 0;
05132 int modify = 0;
05133 int i, ascompat, cr;
05134
05135 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05136 if (argc < 1) {
05137 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05138 }
05139 for (i=0; i<argc; i++) {
05140 VALUE s = argv[i];
05141
05142 StringValue(s);
05143 enc = rb_enc_check(str, s);
05144 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05145 }
05146
05147 str_modify_keep_cr(str);
05148 ascompat = rb_enc_asciicompat(enc);
05149 s = t = RSTRING_PTR(str);
05150 send = RSTRING_END(str);
05151 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05152 while (s < send) {
05153 unsigned int c;
05154 int clen;
05155
05156 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05157 if (squeez[c]) {
05158 modify = 1;
05159 }
05160 else {
05161 if (t != s) *t = c;
05162 t++;
05163 }
05164 s++;
05165 }
05166 else {
05167 c = rb_enc_codepoint_len(s, send, &clen, enc);
05168
05169 if (tr_find(c, squeez, del, nodel)) {
05170 modify = 1;
05171 }
05172 else {
05173 if (t != s) rb_enc_mbcput(c, t, enc);
05174 t += clen;
05175 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05176 }
05177 s += clen;
05178 }
05179 }
05180 *t = '\0';
05181 STR_SET_LEN(str, t - RSTRING_PTR(str));
05182 ENC_CODERANGE_SET(str, cr);
05183
05184 if (modify) return str;
05185 return Qnil;
05186 }
05187
05188
05189
05190
05191
05192
05193
05194
05195
05196
05197
05198
05199
05200
05201
05202
05203 static VALUE
05204 rb_str_delete(int argc, VALUE *argv, VALUE str)
05205 {
05206 str = rb_str_dup(str);
05207 rb_str_delete_bang(argc, argv, str);
05208 return str;
05209 }
05210
05211
05212
05213
05214
05215
05216
05217
05218
05219
05220 static VALUE
05221 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05222 {
05223 char squeez[256];
05224 rb_encoding *enc = 0;
05225 VALUE del = 0, nodel = 0;
05226 char *s, *send, *t;
05227 int i, modify = 0;
05228 int ascompat, singlebyte = single_byte_optimizable(str);
05229 unsigned int save;
05230
05231 if (argc == 0) {
05232 enc = STR_ENC_GET(str);
05233 }
05234 else {
05235 for (i=0; i<argc; i++) {
05236 VALUE s = argv[i];
05237
05238 StringValue(s);
05239 enc = rb_enc_check(str, s);
05240 if (singlebyte && !single_byte_optimizable(s))
05241 singlebyte = 0;
05242 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05243 }
05244 }
05245
05246 str_modify_keep_cr(str);
05247 s = t = RSTRING_PTR(str);
05248 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05249 send = RSTRING_END(str);
05250 save = -1;
05251 ascompat = rb_enc_asciicompat(enc);
05252
05253 if (singlebyte) {
05254 while (s < send) {
05255 unsigned int c = *(unsigned char*)s++;
05256 if (c != save || (argc > 0 && !squeez[c])) {
05257 *t++ = save = c;
05258 }
05259 }
05260 } else {
05261 while (s < send) {
05262 unsigned int c;
05263 int clen;
05264
05265 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05266 if (c != save || (argc > 0 && !squeez[c])) {
05267 *t++ = save = c;
05268 }
05269 s++;
05270 }
05271 else {
05272 c = rb_enc_codepoint_len(s, send, &clen, enc);
05273
05274 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05275 if (t != s) rb_enc_mbcput(c, t, enc);
05276 save = c;
05277 t += clen;
05278 }
05279 s += clen;
05280 }
05281 }
05282 }
05283
05284 *t = '\0';
05285 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05286 STR_SET_LEN(str, t - RSTRING_PTR(str));
05287 modify = 1;
05288 }
05289
05290 if (modify) return str;
05291 return Qnil;
05292 }
05293
05294
05295
05296
05297
05298
05299
05300
05301
05302
05303
05304
05305
05306
05307
05308
05309
05310 static VALUE
05311 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05312 {
05313 str = rb_str_dup(str);
05314 rb_str_squeeze_bang(argc, argv, str);
05315 return str;
05316 }
05317
05318
05319
05320
05321
05322
05323
05324
05325
05326
05327 static VALUE
05328 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05329 {
05330 return tr_trans(str, src, repl, 1);
05331 }
05332
05333
05334
05335
05336
05337
05338
05339
05340
05341
05342
05343
05344
05345
05346
05347 static VALUE
05348 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05349 {
05350 str = rb_str_dup(str);
05351 tr_trans(str, src, repl, 1);
05352 return str;
05353 }
05354
05355
05356
05357
05358
05359
05360
05361
05362
05363
05364
05365
05366
05367
05368
05369
05370
05371
05372 static VALUE
05373 rb_str_count(int argc, VALUE *argv, VALUE str)
05374 {
05375 char table[256];
05376 rb_encoding *enc = 0;
05377 VALUE del = 0, nodel = 0;
05378 char *s, *send;
05379 int i;
05380 int ascompat;
05381
05382 if (argc < 1) {
05383 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05384 }
05385 for (i=0; i<argc; i++) {
05386 VALUE tstr = argv[i];
05387 unsigned char c;
05388
05389 StringValue(tstr);
05390 enc = rb_enc_check(str, tstr);
05391 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05392 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05393 int n = 0;
05394
05395 s = RSTRING_PTR(str);
05396 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05397 send = RSTRING_END(str);
05398 while (s < send) {
05399 if (*(unsigned char*)s++ == c) n++;
05400 }
05401 return INT2NUM(n);
05402 }
05403 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05404 }
05405
05406 s = RSTRING_PTR(str);
05407 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05408 send = RSTRING_END(str);
05409 ascompat = rb_enc_asciicompat(enc);
05410 i = 0;
05411 while (s < send) {
05412 unsigned int c;
05413
05414 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05415 if (table[c]) {
05416 i++;
05417 }
05418 s++;
05419 }
05420 else {
05421 int clen;
05422 c = rb_enc_codepoint_len(s, send, &clen, enc);
05423 if (tr_find(c, table, del, nodel)) {
05424 i++;
05425 }
05426 s += clen;
05427 }
05428 }
05429
05430 return INT2NUM(i);
05431 }
05432
/*
 * Byte-indexed whitespace lookup: entries 9 through 13 (\t \n \v \f \r)
 * and 32 (space) are 1, every other byte value is 0.
 */
static const char isspacetable[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Table-driven whitespace test; the argument is reduced to one byte. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05453
05454
05455
05456
05457
05458
05459
05460
05461
05462
05463
05464
05465
05466
05467
05468
05469
05470
05471
05472
05473
05474
05475
05476
05477
05478
05479
05480
05481
05482
05483
05484
05485
05486
05487
05488
05489
05490
05491
05492
05493
05494
05495
05496 static VALUE
05497 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05498 {
05499 rb_encoding *enc;
05500 VALUE spat;
05501 VALUE limit;
05502 enum {awk, string, regexp} split_type;
05503 long beg, end, i = 0;
05504 int lim = 0;
05505 VALUE result, tmp;
05506
05507 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05508 lim = NUM2INT(limit);
05509 if (lim <= 0) limit = Qnil;
05510 else if (lim == 1) {
05511 if (RSTRING_LEN(str) == 0)
05512 return rb_ary_new2(0);
05513 return rb_ary_new3(1, str);
05514 }
05515 i = 1;
05516 }
05517
05518 enc = STR_ENC_GET(str);
05519 if (NIL_P(spat)) {
05520 if (!NIL_P(rb_fs)) {
05521 spat = rb_fs;
05522 goto fs_set;
05523 }
05524 split_type = awk;
05525 }
05526 else {
05527 fs_set:
05528 if (TYPE(spat) == T_STRING) {
05529 rb_encoding *enc2 = STR_ENC_GET(spat);
05530
05531 split_type = string;
05532 if (RSTRING_LEN(spat) == 0) {
05533
05534 spat = rb_reg_regcomp(spat);
05535 split_type = regexp;
05536 }
05537 else if (rb_enc_asciicompat(enc2) == 1) {
05538 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05539 split_type = awk;
05540 }
05541 }
05542 else {
05543 int l;
05544 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05545 RSTRING_LEN(spat) == l) {
05546 split_type = awk;
05547 }
05548 }
05549 }
05550 else {
05551 spat = get_pat(spat, 1);
05552 split_type = regexp;
05553 }
05554 }
05555
05556 result = rb_ary_new();
05557 beg = 0;
05558 if (split_type == awk) {
05559 char *ptr = RSTRING_PTR(str);
05560 char *eptr = RSTRING_END(str);
05561 char *bptr = ptr;
05562 int skip = 1;
05563 unsigned int c;
05564
05565 end = beg;
05566 if (is_ascii_string(str)) {
05567 while (ptr < eptr) {
05568 c = (unsigned char)*ptr++;
05569 if (skip) {
05570 if (ascii_isspace(c)) {
05571 beg = ptr - bptr;
05572 }
05573 else {
05574 end = ptr - bptr;
05575 skip = 0;
05576 if (!NIL_P(limit) && lim <= i) break;
05577 }
05578 }
05579 else if (ascii_isspace(c)) {
05580 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05581 skip = 1;
05582 beg = ptr - bptr;
05583 if (!NIL_P(limit)) ++i;
05584 }
05585 else {
05586 end = ptr - bptr;
05587 }
05588 }
05589 }
05590 else {
05591 while (ptr < eptr) {
05592 int n;
05593
05594 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05595 ptr += n;
05596 if (skip) {
05597 if (rb_isspace(c)) {
05598 beg = ptr - bptr;
05599 }
05600 else {
05601 end = ptr - bptr;
05602 skip = 0;
05603 if (!NIL_P(limit) && lim <= i) break;
05604 }
05605 }
05606 else if (rb_isspace(c)) {
05607 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05608 skip = 1;
05609 beg = ptr - bptr;
05610 if (!NIL_P(limit)) ++i;
05611 }
05612 else {
05613 end = ptr - bptr;
05614 }
05615 }
05616 }
05617 }
05618 else if (split_type == string) {
05619 char *ptr = RSTRING_PTR(str);
05620 char *temp = ptr;
05621 char *eptr = RSTRING_END(str);
05622 char *sptr = RSTRING_PTR(spat);
05623 long slen = RSTRING_LEN(spat);
05624
05625 if (is_broken_string(str)) {
05626 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05627 }
05628 if (is_broken_string(spat)) {
05629 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05630 }
05631 enc = rb_enc_check(str, spat);
05632 while (ptr < eptr &&
05633 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05634
05635 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05636 if (t != ptr + end) {
05637 ptr = t;
05638 continue;
05639 }
05640 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05641 ptr += end + slen;
05642 if (!NIL_P(limit) && lim <= ++i) break;
05643 }
05644 beg = ptr - temp;
05645 }
05646 else {
05647 char *ptr = RSTRING_PTR(str);
05648 long len = RSTRING_LEN(str);
05649 long start = beg;
05650 long idx;
05651 int last_null = 0;
05652 struct re_registers *regs;
05653
05654 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05655 regs = RMATCH_REGS(rb_backref_get());
05656 if (start == end && BEG(0) == END(0)) {
05657 if (!ptr) {
05658 rb_ary_push(result, str_new_empty(str));
05659 break;
05660 }
05661 else if (last_null == 1) {
05662 rb_ary_push(result, rb_str_subseq(str, beg,
05663 rb_enc_fast_mbclen(ptr+beg,
05664 ptr+len,
05665 enc)));
05666 beg = start;
05667 }
05668 else {
05669 if (ptr+start == ptr+len)
05670 start++;
05671 else
05672 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05673 last_null = 1;
05674 continue;
05675 }
05676 }
05677 else {
05678 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05679 beg = start = END(0);
05680 }
05681 last_null = 0;
05682
05683 for (idx=1; idx < regs->num_regs; idx++) {
05684 if (BEG(idx) == -1) continue;
05685 if (BEG(idx) == END(idx))
05686 tmp = str_new_empty(str);
05687 else
05688 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05689 rb_ary_push(result, tmp);
05690 }
05691 if (!NIL_P(limit) && lim <= ++i) break;
05692 }
05693 }
05694 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05695 if (RSTRING_LEN(str) == beg)
05696 tmp = str_new_empty(str);
05697 else
05698 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05699 rb_ary_push(result, tmp);
05700 }
05701 if (NIL_P(limit) && lim == 0) {
05702 long len;
05703 while ((len = RARRAY_LEN(result)) > 0 &&
05704 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05705 rb_ary_pop(result);
05706 }
05707
05708 return result;
05709 }
05710
05711 VALUE
05712 rb_str_split(VALUE str, const char *sep0)
05713 {
05714 VALUE sep;
05715
05716 StringValue(str);
05717 sep = rb_str_new2(sep0);
05718 return rb_str_split_m(1, &sep, str);
05719 }
05720
05721
05722
05723
05724
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736
05737
05738
05739
05740
05741
05742
05743
05744
05745
05746
05747
05748
05749
05750
05751
05752
05753
05754
05755
05756
05757
05758
/*
 * each_line: yields successive lines of +str+ to the block, each line
 * including its terminating separator, and returns the receiver.
 *
 * The optional argument is the record separator; without it rb_rs is
 * used.  Without a block, RETURN_ENUMERATOR handles the call.
 *
 *   - separator nil: the whole string is yielded once;
 *   - separator is rb_default_rs: lines are split on newline characters,
 *     located with memchr and then validated against character
 *     boundaries;
 *   - empty separator: runs of consecutive newlines act as one
 *     separator (paragraph mode);
 *   - otherwise: lines end at each occurrence of the separator string.
 *
 * Iteration works on the string returned by rb_str_new4(), and
 * str_mod_check() is called after every yield.
 */
static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE rs;
    unsigned int newline;       /* first code point of the separator */
    const char *p, *pend, *s, *ptr;
    long len, rslen;
    VALUE line;
    int n;
    VALUE orig = str;           /* returned to the caller */

    if (argc == 0) {
        rs = rb_rs;
    }
    else {
        rb_scan_args(argc, argv, "01", &rs);
    }
    RETURN_ENUMERATOR(str, argc, argv);
    if (NIL_P(rs)) {
        rb_yield(str);
        return orig;
    }
    str = rb_str_new4(str);
    /* s: start of the current line; p: scan cursor; ptr/len: values
     * handed to str_mod_check() after each yield */
    ptr = p = s = RSTRING_PTR(str);
    pend = p + RSTRING_LEN(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    if (rs == rb_default_rs) {
        enc = rb_enc_get(str);
        while (p < pend) {
            char *p0;

            p = memchr(p, '\n', pend - p);
            if (!p) break;
            /* the '\n' byte may be part of a longer character: back up
             * to the character head and test the whole character */
            p0 = rb_enc_left_char_head(s, p, pend, enc);
            if (!rb_enc_is_newline(p0, pend, enc)) {
                p++;
                continue;
            }
            p = p0 + rb_enc_mbclen(p0, pend, enc);
            line = rb_str_new5(str, s, p - s);
            OBJ_INFECT(line, str);
            rb_enc_cr_str_copy_for_substr(line, str);
            rb_yield(line);
            str_mod_check(str, ptr, len);
            s = p;
        }
        goto finish;
    }

    enc = rb_enc_check(str, rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
        /* paragraph mode */
        newline = '\n';
    }
    else {
        newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
    }

    while (p < pend) {
        unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);

      again:
        if (rslen == 0 && c == newline) {
            /* paragraph mode: a single newline does not end the line;
             * look at the following character */
            p += n;
            if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
                goto again;
            }
            /* swallow the rest of the newline run, then step back so
             * that p points at its last newline */
            while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
                p += n;
            }
            p -= n;
        }
        if (c == newline &&
            (rslen <= 1 ||
             (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
            /* the line includes the separator (or, in paragraph mode,
             * the final newline of the run) */
            line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
            OBJ_INFECT(line, str);
            rb_enc_cr_str_copy_for_substr(line, str);
            rb_yield(line);
            str_mod_check(str, ptr, len);
            s = p + (rslen ? rslen : n);
        }
        p += n;
    }

  finish:
    /* trailing text without a terminating separator */
    if (s != pend) {
        line = rb_str_new5(str, s, pend - s);
        OBJ_INFECT(line, str);
        rb_enc_cr_str_copy_for_substr(line, str);
        rb_yield(line);
    }

    return orig;
}
05856
05857
05858
05859
05860
05861
05862
05863
05864
05865
05866
05867
05868
05869
05870
05871
05872
05873
05874
05875
05876 static VALUE
05877 rb_str_each_byte(VALUE str)
05878 {
05879 long i;
05880
05881 RETURN_ENUMERATOR(str, 0, 0);
05882 for (i=0; i<RSTRING_LEN(str); i++) {
05883 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05884 }
05885 return str;
05886 }
05887
05888
05889
05890
05891
05892
05893
05894
05895
05896
05897
05898
05899
05900
05901
05902
05903
05904
05905
05906
05907 static VALUE
05908 rb_str_each_char(VALUE str)
05909 {
05910 VALUE orig = str;
05911 long i, len, n;
05912 const char *ptr;
05913 rb_encoding *enc;
05914
05915 RETURN_ENUMERATOR(str, 0, 0);
05916 str = rb_str_new4(str);
05917 ptr = RSTRING_PTR(str);
05918 len = RSTRING_LEN(str);
05919 enc = rb_enc_get(str);
05920 switch (ENC_CODERANGE(str)) {
05921 case ENC_CODERANGE_VALID:
05922 case ENC_CODERANGE_7BIT:
05923 for (i = 0; i < len; i += n) {
05924 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05925 rb_yield(rb_str_subseq(str, i, n));
05926 }
05927 break;
05928 default:
05929 for (i = 0; i < len; i += n) {
05930 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05931 rb_yield(rb_str_subseq(str, i, n));
05932 }
05933 }
05934 return orig;
05935 }
05936
05937
05938
05939
05940
05941
05942
05943
05944
05945
05946
05947
05948
05949
05950
05951
05952
05953
05954
05955
05956
05957
05958 static VALUE
05959 rb_str_each_codepoint(VALUE str)
05960 {
05961 VALUE orig = str;
05962 long len;
05963 int n;
05964 unsigned int c;
05965 const char *ptr, *end;
05966 rb_encoding *enc;
05967
05968 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05969 RETURN_ENUMERATOR(str, 0, 0);
05970 str = rb_str_new4(str);
05971 ptr = RSTRING_PTR(str);
05972 len = RSTRING_LEN(str);
05973 end = RSTRING_END(str);
05974 enc = STR_ENC_GET(str);
05975 while (ptr < end) {
05976 c = rb_enc_codepoint_len(ptr, end, &n, enc);
05977 rb_yield(UINT2NUM(c));
05978 ptr += n;
05979 }
05980 return orig;
05981 }
05982
05983 static long
05984 chopped_length(VALUE str)
05985 {
05986 rb_encoding *enc = STR_ENC_GET(str);
05987 const char *p, *p2, *beg, *end;
05988
05989 beg = RSTRING_PTR(str);
05990 end = beg + RSTRING_LEN(str);
05991 if (beg > end) return 0;
05992 p = rb_enc_prev_char(beg, end, end, enc);
05993 if (!p) return 0;
05994 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05995 p2 = rb_enc_prev_char(beg, p, end, enc);
05996 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05997 }
05998 return p - beg;
05999 }
06000
06001
06002
06003
06004
06005
06006
06007
06008
06009
06010 static VALUE
06011 rb_str_chop_bang(VALUE str)
06012 {
06013 str_modify_keep_cr(str);
06014 if (RSTRING_LEN(str) > 0) {
06015 long len;
06016 len = chopped_length(str);
06017 STR_SET_LEN(str, len);
06018 RSTRING_PTR(str)[len] = '\0';
06019 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06020 ENC_CODERANGE_CLEAR(str);
06021 }
06022 return str;
06023 }
06024 return Qnil;
06025 }
06026
06027
06028
06029
06030
06031
06032
06033
06034
06035
06036
06037
06038
06039
06040
06041
06042
06043
06044
06045 static VALUE
06046 rb_str_chop(VALUE str)
06047 {
06048 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06049 rb_enc_cr_str_copy_for_substr(str2, str);
06050 OBJ_INFECT(str2, str);
06051 return str2;
06052 }
06053
06054
06055
06056
06057
06058
06059
06060
06061
06062
06063 static VALUE
06064 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06065 {
06066 rb_encoding *enc;
06067 VALUE rs;
06068 int newline;
06069 char *p, *pp, *e;
06070 long len, rslen;
06071
06072 str_modify_keep_cr(str);
06073 len = RSTRING_LEN(str);
06074 if (len == 0) return Qnil;
06075 p = RSTRING_PTR(str);
06076 e = p + len;
06077 if (argc == 0) {
06078 rs = rb_rs;
06079 if (rs == rb_default_rs) {
06080 smart_chomp:
06081 enc = rb_enc_get(str);
06082 if (rb_enc_mbminlen(enc) > 1) {
06083 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06084 if (rb_enc_is_newline(pp, e, enc)) {
06085 e = pp;
06086 }
06087 pp = e - rb_enc_mbminlen(enc);
06088 if (pp >= p) {
06089 pp = rb_enc_left_char_head(p, pp, e, enc);
06090 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06091 e = pp;
06092 }
06093 }
06094 if (e == RSTRING_END(str)) {
06095 return Qnil;
06096 }
06097 len = e - RSTRING_PTR(str);
06098 STR_SET_LEN(str, len);
06099 }
06100 else {
06101 if (RSTRING_PTR(str)[len-1] == '\n') {
06102 STR_DEC_LEN(str);
06103 if (RSTRING_LEN(str) > 0 &&
06104 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06105 STR_DEC_LEN(str);
06106 }
06107 }
06108 else if (RSTRING_PTR(str)[len-1] == '\r') {
06109 STR_DEC_LEN(str);
06110 }
06111 else {
06112 return Qnil;
06113 }
06114 }
06115 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06116 return str;
06117 }
06118 }
06119 else {
06120 rb_scan_args(argc, argv, "01", &rs);
06121 }
06122 if (NIL_P(rs)) return Qnil;
06123 StringValue(rs);
06124 rslen = RSTRING_LEN(rs);
06125 if (rslen == 0) {
06126 while (len>0 && p[len-1] == '\n') {
06127 len--;
06128 if (len>0 && p[len-1] == '\r')
06129 len--;
06130 }
06131 if (len < RSTRING_LEN(str)) {
06132 STR_SET_LEN(str, len);
06133 RSTRING_PTR(str)[len] = '\0';
06134 return str;
06135 }
06136 return Qnil;
06137 }
06138 if (rslen > len) return Qnil;
06139 newline = RSTRING_PTR(rs)[rslen-1];
06140 if (rslen == 1 && newline == '\n')
06141 goto smart_chomp;
06142
06143 enc = rb_enc_check(str, rs);
06144 if (is_broken_string(rs)) {
06145 return Qnil;
06146 }
06147 pp = e - rslen;
06148 if (p[len-1] == newline &&
06149 (rslen <= 1 ||
06150 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06151 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06152 return Qnil;
06153 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06154 ENC_CODERANGE_CLEAR(str);
06155 }
06156 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06157 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06158 return str;
06159 }
06160 return Qnil;
06161 }
06162
06163
06164
06165
06166
06167
06168
06169
06170
06171
06172
06173
06174
06175
06176
06177
06178
06179
06180
06181
06182
06183 static VALUE
06184 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06185 {
06186 str = rb_str_dup(str);
06187 rb_str_chomp_bang(argc, argv, str);
06188 return str;
06189 }
06190
06191
06192
06193
06194
06195
06196
06197
06198
06199
06200
06201
06202
06203 static VALUE
06204 rb_str_lstrip_bang(VALUE str)
06205 {
06206 rb_encoding *enc;
06207 char *s, *t, *e;
06208
06209 str_modify_keep_cr(str);
06210 enc = STR_ENC_GET(str);
06211 s = RSTRING_PTR(str);
06212 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06213 e = t = RSTRING_END(str);
06214
06215 while (s < e) {
06216 int n;
06217 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06218
06219 if (!rb_isspace(cc)) break;
06220 s += n;
06221 }
06222
06223 if (s > RSTRING_PTR(str)) {
06224 STR_SET_LEN(str, t-s);
06225 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06226 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06227 return str;
06228 }
06229 return Qnil;
06230 }
06231
06232
06233
06234
06235
06236
06237
06238
06239
06240
06241
06242
06243
06244 static VALUE
06245 rb_str_lstrip(VALUE str)
06246 {
06247 str = rb_str_dup(str);
06248 rb_str_lstrip_bang(str);
06249 return str;
06250 }
06251
06252
06253
06254
06255
06256
06257
06258
06259
06260
06261
06262
06263
06264
06265 static VALUE
06266 rb_str_rstrip_bang(VALUE str)
06267 {
06268 rb_encoding *enc;
06269 char *s, *t, *e;
06270
06271 str_modify_keep_cr(str);
06272 enc = STR_ENC_GET(str);
06273 rb_str_check_dummy_enc(enc);
06274 s = RSTRING_PTR(str);
06275 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06276 t = e = RSTRING_END(str);
06277
06278
06279 if (single_byte_optimizable(str)) {
06280 unsigned char c;
06281 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06282 }
06283 else {
06284 char *tp;
06285
06286 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06287 unsigned int c = rb_enc_codepoint(tp, e, enc);
06288 if (c && !rb_isspace(c)) break;
06289 t = tp;
06290 }
06291 }
06292 if (t < e) {
06293 long len = t-RSTRING_PTR(str);
06294
06295 STR_SET_LEN(str, len);
06296 RSTRING_PTR(str)[len] = '\0';
06297 return str;
06298 }
06299 return Qnil;
06300 }
06301
06302
06303
06304
06305
06306
06307
06308
06309
06310
06311
06312
06313
06314 static VALUE
06315 rb_str_rstrip(VALUE str)
06316 {
06317 str = rb_str_dup(str);
06318 rb_str_rstrip_bang(str);
06319 return str;
06320 }
06321
06322
06323
06324
06325
06326
06327
06328
06329
06330
06331 static VALUE
06332 rb_str_strip_bang(VALUE str)
06333 {
06334 VALUE l = rb_str_lstrip_bang(str);
06335 VALUE r = rb_str_rstrip_bang(str);
06336
06337 if (NIL_P(l) && NIL_P(r)) return Qnil;
06338 return str;
06339 }
06340
06341
06342
06343
06344
06345
06346
06347
06348
06349
06350
06351
06352 static VALUE
06353 rb_str_strip(VALUE str)
06354 {
06355 str = rb_str_dup(str);
06356 rb_str_strip_bang(str);
06357 return str;
06358 }
06359
06360 static VALUE
06361 scan_once(VALUE str, VALUE pat, long *start)
06362 {
06363 VALUE result, match;
06364 struct re_registers *regs;
06365 int i;
06366
06367 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06368 match = rb_backref_get();
06369 regs = RMATCH_REGS(match);
06370 if (BEG(0) == END(0)) {
06371 rb_encoding *enc = STR_ENC_GET(str);
06372
06373
06374
06375 if (RSTRING_LEN(str) > END(0))
06376 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06377 RSTRING_END(str), enc);
06378 else
06379 *start = END(0)+1;
06380 }
06381 else {
06382 *start = END(0);
06383 }
06384 if (regs->num_regs == 1) {
06385 return rb_reg_nth_match(0, match);
06386 }
06387 result = rb_ary_new2(regs->num_regs);
06388 for (i=1; i < regs->num_regs; i++) {
06389 rb_ary_push(result, rb_reg_nth_match(i, match));
06390 }
06391
06392 return result;
06393 }
06394 return Qnil;
06395 }
06396
06397
06398
06399
06400
06401
06402
06403
06404
06405
06406
06407
06408
06409
06410
06411
06412
06413
06414
06415
06416
06417
06418
06419
06420
06421
06422
06423
06424
06425
06426
06427
06428
06429 static VALUE
06430 rb_str_scan(VALUE str, VALUE pat)
06431 {
06432 VALUE result;
06433 long start = 0;
06434 long last = -1, prev = 0;
06435 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06436
06437 pat = get_pat(pat, 1);
06438 if (!rb_block_given_p()) {
06439 VALUE ary = rb_ary_new();
06440
06441 while (!NIL_P(result = scan_once(str, pat, &start))) {
06442 last = prev;
06443 prev = start;
06444 rb_ary_push(ary, result);
06445 }
06446 if (last >= 0) rb_reg_search(pat, str, last, 0);
06447 return ary;
06448 }
06449
06450 while (!NIL_P(result = scan_once(str, pat, &start))) {
06451 last = prev;
06452 prev = start;
06453 rb_yield(result);
06454 str_mod_check(str, p, len);
06455 }
06456 if (last >= 0) rb_reg_search(pat, str, last, 0);
06457 return str;
06458 }
06459
06460
06461
06462
06463
06464
06465
06466
06467
06468
06469
06470
06471
06472
06473
06474
06475 static VALUE
06476 rb_str_hex(VALUE str)
06477 {
06478 rb_encoding *enc = rb_enc_get(str);
06479
06480 if (!rb_enc_asciicompat(enc)) {
06481 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06482 }
06483 return rb_str_to_inum(str, 16, FALSE);
06484 }
06485
06486
06487
06488
06489
06490
06491
06492
06493
06494
06495
06496
06497
06498
06499
06500
06501 static VALUE
06502 rb_str_oct(VALUE str)
06503 {
06504 rb_encoding *enc = rb_enc_get(str);
06505
06506 if (!rb_enc_asciicompat(enc)) {
06507 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06508 }
06509 return rb_str_to_inum(str, -8, FALSE);
06510 }
06511
06512
06513
06514
06515
06516
06517
06518
06519
06520
06521
06522
06523 static VALUE
06524 rb_str_crypt(VALUE str, VALUE salt)
06525 {
06526 extern char *crypt(const char *, const char *);
06527 VALUE result;
06528 const char *s, *saltp;
06529 #ifdef BROKEN_CRYPT
06530 char salt_8bit_clean[3];
06531 #endif
06532
06533 StringValue(salt);
06534 if (RSTRING_LEN(salt) < 2)
06535 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06536
06537 s = RSTRING_PTR(str);
06538 if (!s) s = "";
06539 saltp = RSTRING_PTR(salt);
06540 #ifdef BROKEN_CRYPT
06541 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06542 salt_8bit_clean[0] = saltp[0] & 0x7f;
06543 salt_8bit_clean[1] = saltp[1] & 0x7f;
06544 salt_8bit_clean[2] = '\0';
06545 saltp = salt_8bit_clean;
06546 }
06547 #endif
06548 result = rb_str_new2(crypt(s, saltp));
06549 OBJ_INFECT(result, str);
06550 OBJ_INFECT(result, salt);
06551 return result;
06552 }
06553
06554
06555
06556
06557
06558
06559
06560
06561
06562
06563
06564
06565
06566
06567
06568
06569
06570
06571
06572
06573
06574
06575 VALUE
06576 rb_str_intern(VALUE s)
06577 {
06578 VALUE str = RB_GC_GUARD(s);
06579 ID id;
06580
06581 id = rb_intern_str(str);
06582 return ID2SYM(id);
06583 }
06584
06585
06586
06587
06588
06589
06590
06591
06592
06593
06594
06595 VALUE
06596 rb_str_ord(VALUE s)
06597 {
06598 unsigned int c;
06599
06600 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06601 return UINT2NUM(c);
06602 }
06603
06604
06605
06606
06607
06608
06609
06610
06611
06612
06613
06614 static VALUE
06615 rb_str_sum(int argc, VALUE *argv, VALUE str)
06616 {
06617 VALUE vbits;
06618 int bits;
06619 char *ptr, *p, *pend;
06620 long len;
06621 VALUE sum = INT2FIX(0);
06622 unsigned long sum0 = 0;
06623
06624 if (argc == 0) {
06625 bits = 16;
06626 }
06627 else {
06628 rb_scan_args(argc, argv, "01", &vbits);
06629 bits = NUM2INT(vbits);
06630 }
06631 ptr = p = RSTRING_PTR(str);
06632 len = RSTRING_LEN(str);
06633 pend = p + len;
06634
06635 while (p < pend) {
06636 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06637 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06638 str_mod_check(str, ptr, len);
06639 sum0 = 0;
06640 }
06641 sum0 += (unsigned char)*p;
06642 p++;
06643 }
06644
06645 if (bits == 0) {
06646 if (sum0) {
06647 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06648 }
06649 }
06650 else {
06651 if (sum == INT2FIX(0)) {
06652 if (bits < (int)sizeof(long)*CHAR_BIT) {
06653 sum0 &= (((unsigned long)1)<<bits)-1;
06654 }
06655 sum = LONG2FIX(sum0);
06656 }
06657 else {
06658 VALUE mod;
06659
06660 if (sum0) {
06661 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06662 }
06663
06664 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06665 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06666 sum = rb_funcall(sum, '&', 1, mod);
06667 }
06668 }
06669 return sum;
06670 }
06671
06672 static VALUE
06673 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06674 {
06675 rb_encoding *enc;
06676 VALUE w;
06677 long width, len, flen = 1, fclen = 1;
06678 VALUE res;
06679 char *p;
06680 const char *f = " ";
06681 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06682 volatile VALUE pad;
06683 int singlebyte = 1, cr;
06684
06685 rb_scan_args(argc, argv, "11", &w, &pad);
06686 enc = STR_ENC_GET(str);
06687 width = NUM2LONG(w);
06688 if (argc == 2) {
06689 StringValue(pad);
06690 enc = rb_enc_check(str, pad);
06691 f = RSTRING_PTR(pad);
06692 flen = RSTRING_LEN(pad);
06693 fclen = str_strlen(pad, enc);
06694 singlebyte = single_byte_optimizable(pad);
06695 if (flen == 0 || fclen == 0) {
06696 rb_raise(rb_eArgError, "zero width padding");
06697 }
06698 }
06699 len = str_strlen(str, enc);
06700 if (width < 0 || len >= width) return rb_str_dup(str);
06701 n = width - len;
06702 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06703 rlen = n - llen;
06704 cr = ENC_CODERANGE(str);
06705 if (flen > 1) {
06706 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06707 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06708 }
06709 size = RSTRING_LEN(str);
06710 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06711 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06712 (len += llen2 + rlen2) >= LONG_MAX - size) {
06713 rb_raise(rb_eArgError, "argument too big");
06714 }
06715 len += size;
06716 res = rb_str_new5(str, 0, len);
06717 p = RSTRING_PTR(res);
06718 if (flen <= 1) {
06719 memset(p, *f, llen);
06720 p += llen;
06721 }
06722 else {
06723 while (llen >= fclen) {
06724 memcpy(p,f,flen);
06725 p += flen;
06726 llen -= fclen;
06727 }
06728 if (llen > 0) {
06729 memcpy(p, f, llen2);
06730 p += llen2;
06731 }
06732 }
06733 memcpy(p, RSTRING_PTR(str), size);
06734 p += size;
06735 if (flen <= 1) {
06736 memset(p, *f, rlen);
06737 p += rlen;
06738 }
06739 else {
06740 while (rlen >= fclen) {
06741 memcpy(p,f,flen);
06742 p += flen;
06743 rlen -= fclen;
06744 }
06745 if (rlen > 0) {
06746 memcpy(p, f, rlen2);
06747 p += rlen2;
06748 }
06749 }
06750 *p = '\0';
06751 STR_SET_LEN(res, p-RSTRING_PTR(res));
06752 OBJ_INFECT(res, str);
06753 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06754 rb_enc_associate(res, enc);
06755 if (argc == 2)
06756 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06757 if (cr != ENC_CODERANGE_BROKEN)
06758 ENC_CODERANGE_SET(res, cr);
06759 return res;
06760 }
06761
06762
06763
06764
06765
06766
06767
06768
06769
06770
06771
06772
06773
06774
06775
06776 static VALUE
06777 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06778 {
06779 return rb_str_justify(argc, argv, str, 'l');
06780 }
06781
06782
06783
06784
06785
06786
06787
06788
06789
06790
06791
06792
06793
06794
06795
06796 static VALUE
06797 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06798 {
06799 return rb_str_justify(argc, argv, str, 'r');
06800 }
06801
06802
06803
06804
06805
06806
06807
06808
06809
06810
06811
06812
06813
06814
06815
06816 static VALUE
06817 rb_str_center(int argc, VALUE *argv, VALUE str)
06818 {
06819 return rb_str_justify(argc, argv, str, 'c');
06820 }
06821
06822
06823
06824
06825
06826
06827
06828
06829
06830
06831
06832
06833
06834
06835
06836
06837 static VALUE
06838 rb_str_partition(VALUE str, VALUE sep)
06839 {
06840 long pos;
06841 int regex = FALSE;
06842
06843 if (TYPE(sep) == T_REGEXP) {
06844 pos = rb_reg_search(sep, str, 0, 0);
06845 regex = TRUE;
06846 }
06847 else {
06848 VALUE tmp;
06849
06850 tmp = rb_check_string_type(sep);
06851 if (NIL_P(tmp)) {
06852 rb_raise(rb_eTypeError, "type mismatch: %s given",
06853 rb_obj_classname(sep));
06854 }
06855 sep = tmp;
06856 pos = rb_str_index(str, sep, 0);
06857 }
06858 if (pos < 0) {
06859 failed:
06860 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06861 }
06862 if (regex) {
06863 sep = rb_str_subpat(str, sep, INT2FIX(0));
06864 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
06865 }
06866 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06867 sep,
06868 rb_str_subseq(str, pos+RSTRING_LEN(sep),
06869 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06870 }
06871
06872
06873
06874
06875
06876
06877
06878
06879
06880
06881
06882
06883
06884
06885
06886
06887 static VALUE
06888 rb_str_rpartition(VALUE str, VALUE sep)
06889 {
06890 long pos = RSTRING_LEN(str);
06891 int regex = FALSE;
06892
06893 if (TYPE(sep) == T_REGEXP) {
06894 pos = rb_reg_search(sep, str, pos, 1);
06895 regex = TRUE;
06896 }
06897 else {
06898 VALUE tmp;
06899
06900 tmp = rb_check_string_type(sep);
06901 if (NIL_P(tmp)) {
06902 rb_raise(rb_eTypeError, "type mismatch: %s given",
06903 rb_obj_classname(sep));
06904 }
06905 sep = tmp;
06906 pos = rb_str_sublen(str, pos);
06907 pos = rb_str_rindex(str, sep, pos);
06908 }
06909 if (pos < 0) {
06910 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06911 }
06912 if (regex) {
06913 sep = rb_reg_nth_match(0, rb_backref_get());
06914 }
06915 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06916 sep,
06917 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06918 }
06919
06920
06921
06922
06923
06924
06925
06926
06927
06928
06929
06930
06931
06932
06933
06934
06935
06936 static VALUE
06937 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06938 {
06939 int i;
06940
06941 for (i=0; i<argc; i++) {
06942 VALUE tmp = rb_check_string_type(argv[i]);
06943 if (NIL_P(tmp)) continue;
06944 rb_enc_check(str, tmp);
06945 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06946 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06947 return Qtrue;
06948 }
06949 return Qfalse;
06950 }
06951
06952
06953
06954
06955
06956
06957
06958
06959 static VALUE
06960 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06961 {
06962 int i;
06963 char *p, *s, *e;
06964 rb_encoding *enc;
06965
06966 for (i=0; i<argc; i++) {
06967 VALUE tmp = rb_check_string_type(argv[i]);
06968 if (NIL_P(tmp)) continue;
06969 enc = rb_enc_check(str, tmp);
06970 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06971 p = RSTRING_PTR(str);
06972 e = p + RSTRING_LEN(str);
06973 s = e - RSTRING_LEN(tmp);
06974 if (rb_enc_left_char_head(p, s, e, enc) != s)
06975 continue;
06976 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06977 return Qtrue;
06978 }
06979 return Qfalse;
06980 }
06981
06982 void
06983 rb_str_setter(VALUE val, ID id, VALUE *var)
06984 {
06985 if (!NIL_P(val) && TYPE(val) != T_STRING) {
06986 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06987 }
06988 *var = val;
06989 }
06990
06991
06992
06993
06994
06995
06996
06997
06998
06999 static VALUE
07000 rb_str_force_encoding(VALUE str, VALUE enc)
07001 {
07002 str_modifiable(str);
07003 rb_enc_associate(str, rb_to_encoding(enc));
07004 ENC_CODERANGE_CLEAR(str);
07005 return str;
07006 }
07007
07008
07009
07010
07011
07012
07013
07014
07015
07016
07017
07018
07019 static VALUE
07020 rb_str_valid_encoding_p(VALUE str)
07021 {
07022 int cr = rb_enc_str_coderange(str);
07023
07024 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07025 }
07026
07027
07028
07029
07030
07031
07032
07033
07034
07035
07036
07037 static VALUE
07038 rb_str_is_ascii_only_p(VALUE str)
07039 {
07040 int cr = rb_enc_str_coderange(str);
07041
07042 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07043 }
07044
07045
07046
07047
07048
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073
07074
07075
07076
07077
07078
07079
07080
07081
07082
07083
07084
07085
07086
07087 static VALUE
07088 sym_equal(VALUE sym1, VALUE sym2)
07089 {
07090 if (sym1 == sym2) return Qtrue;
07091 return Qfalse;
07092 }
07093
07094
07095 static int
07096 sym_printable(const char *s, const char *send, rb_encoding *enc)
07097 {
07098 while (s < send) {
07099 int n;
07100 int c = rb_enc_codepoint_len(s, send, &n, enc);
07101
07102 if (!rb_enc_isprint(c, enc)) return FALSE;
07103 s += n;
07104 }
07105 return TRUE;
07106 }
07107
07108
07109
07110
07111
07112
07113
07114
07115
07116
07117 static VALUE
07118 sym_inspect(VALUE sym)
07119 {
07120 VALUE str;
07121 ID id = SYM2ID(sym);
07122 rb_encoding *enc;
07123 const char *ptr;
07124 long len;
07125 char *dest;
07126 rb_encoding *resenc = rb_default_internal_encoding();
07127
07128 if (resenc == NULL) resenc = rb_default_external_encoding();
07129 sym = rb_id2str(id);
07130 enc = STR_ENC_GET(sym);
07131 ptr = RSTRING_PTR(sym);
07132 len = RSTRING_LEN(sym);
07133 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07134 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07135 str = rb_str_inspect(sym);
07136 len = RSTRING_LEN(str);
07137 rb_str_resize(str, len + 1);
07138 dest = RSTRING_PTR(str);
07139 memmove(dest + 1, dest, len);
07140 dest[0] = ':';
07141 }
07142 else {
07143 char *dest;
07144 str = rb_enc_str_new(0, len + 1, enc);
07145 dest = RSTRING_PTR(str);
07146 dest[0] = ':';
07147 memcpy(dest + 1, ptr, len);
07148 }
07149 return str;
07150 }
07151
07152
07153
07154
07155
07156
07157
07158
07159
07160
07161
07162
07163
07164 VALUE
07165 rb_sym_to_s(VALUE sym)
07166 {
07167 ID id = SYM2ID(sym);
07168
07169 return str_new3(rb_cString, rb_id2str(id));
07170 }
07171
07172
07173
07174
07175
07176
07177
07178
07179
07180
07181
07182
07183 static VALUE
07184 sym_to_sym(VALUE sym)
07185 {
07186 return sym;
07187 }
07188
07189 VALUE rb_funcall_passing_block(VALUE recv, ID mid, int argc, const VALUE *argv);
07190
07191 static VALUE
07192 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07193 {
07194 VALUE obj;
07195
07196 if (argc < 1) {
07197 rb_raise(rb_eArgError, "no receiver given");
07198 }
07199 obj = argv[0];
07200 return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07201 }
07202
07203
07204
07205
07206
07207
07208
07209
07210
07211
07212 static VALUE
07213 sym_to_proc(VALUE sym)
07214 {
07215 static VALUE sym_proc_cache = Qfalse;
07216 enum {SYM_PROC_CACHE_SIZE = 67};
07217 VALUE proc;
07218 long id, index;
07219 VALUE *aryp;
07220
07221 if (!sym_proc_cache) {
07222 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07223 rb_gc_register_mark_object(sym_proc_cache);
07224 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07225 }
07226
07227 id = SYM2ID(sym);
07228 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07229
07230 aryp = RARRAY_PTR(sym_proc_cache);
07231 if (aryp[index] == sym) {
07232 return aryp[index + 1];
07233 }
07234 else {
07235 proc = rb_proc_new(sym_call, (VALUE)id);
07236 aryp[index] = sym;
07237 aryp[index + 1] = proc;
07238 return proc;
07239 }
07240 }
07241
07242
07243
07244
07245
07246
07247
07248
07249
07250 static VALUE
07251 sym_succ(VALUE sym)
07252 {
07253 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07254 }
07255
07256
07257
07258
07259
07260
07261
07262
07263
07264 static VALUE
07265 sym_cmp(VALUE sym, VALUE other)
07266 {
07267 if (!SYMBOL_P(other)) {
07268 return Qnil;
07269 }
07270 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07271 }
07272
07273
07274
07275
07276
07277
07278
07279
07280
07281 static VALUE
07282 sym_casecmp(VALUE sym, VALUE other)
07283 {
07284 if (!SYMBOL_P(other)) {
07285 return Qnil;
07286 }
07287 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07288 }
07289
07290
07291
07292
07293
07294
07295
07296
07297 static VALUE
07298 sym_match(VALUE sym, VALUE other)
07299 {
07300 return rb_str_match(rb_sym_to_s(sym), other);
07301 }
07302
07303
07304
07305
07306
07307
07308
07309
07310
07311 static VALUE
07312 sym_aref(int argc, VALUE *argv, VALUE sym)
07313 {
07314 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07315 }
07316
07317
07318
07319
07320
07321
07322
07323
07324 static VALUE
07325 sym_length(VALUE sym)
07326 {
07327 return rb_str_length(rb_id2str(SYM2ID(sym)));
07328 }
07329
07330
07331
07332
07333
07334
07335
07336
07337 static VALUE
07338 sym_empty(VALUE sym)
07339 {
07340 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07341 }
07342
07343
07344
07345
07346
07347
07348
07349
07350 static VALUE
07351 sym_upcase(VALUE sym)
07352 {
07353 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07354 }
07355
07356
07357
07358
07359
07360
07361
07362
07363 static VALUE
07364 sym_downcase(VALUE sym)
07365 {
07366 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07367 }
07368
07369
07370
07371
07372
07373
07374
07375
07376 static VALUE
07377 sym_capitalize(VALUE sym)
07378 {
07379 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07380 }
07381
07382
07383
07384
07385
07386
07387
07388
07389 static VALUE
07390 sym_swapcase(VALUE sym)
07391 {
07392 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07393 }
07394
07395
07396
07397
07398
07399
07400
07401
07402 static VALUE
07403 sym_encoding(VALUE sym)
07404 {
07405 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07406 }
07407
07408 ID
07409 rb_to_id(VALUE name)
07410 {
07411 VALUE tmp;
07412 ID id;
07413
07414 switch (TYPE(name)) {
07415 default:
07416 tmp = rb_check_string_type(name);
07417 if (NIL_P(tmp)) {
07418 tmp = rb_inspect(name);
07419 rb_raise(rb_eTypeError, "%s is not a symbol",
07420 RSTRING_PTR(tmp));
07421 }
07422 name = tmp;
07423
07424 case T_STRING:
07425 name = rb_str_intern(name);
07426
07427 case T_SYMBOL:
07428 return SYM2ID(name);
07429 }
07430 return id;
07431 }
07432
07433
07434
07435
07436
07437
07438
07439
07440
07441
07442
07443
07444
07445
07446 void
07447 Init_String(void)
07448 {
07449 #undef rb_intern
07450 #define rb_intern(str) rb_intern_const(str)
07451
07452 rb_cString = rb_define_class("String", rb_cObject);
07453 rb_include_module(rb_cString, rb_mComparable);
07454 rb_define_alloc_func(rb_cString, str_alloc);
07455 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07456 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07457 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07458 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07459 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07460 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07461 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07462 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07463 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07464 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07465 rb_define_method(rb_cString, "*", rb_str_times, 1);
07466 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07467 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07468 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07469 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07470 rb_define_method(rb_cString, "length", rb_str_length, 0);
07471 rb_define_method(rb_cString, "size", rb_str_length, 0);
07472 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07473 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07474 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07475 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07476 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07477 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07478 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07479 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07480 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07481 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07482 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07483 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07484 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07485 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07486 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07487 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07488
07489 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07490 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07491 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07492 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07493 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07494 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07495
07496 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07497 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07498 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07499 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07500
07501 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07502 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07503 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07504 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07505
07506 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07507 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07508 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07509 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07510 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07511 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07512 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07513 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07514 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07515 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07516 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07517 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07518 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07519 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07520 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07521
07522 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07523 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07524 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07525
07526 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07527
07528 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07529 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07530 rb_define_method(rb_cString, "center", rb_str_center, -1);
07531
07532 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07533 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07534 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07535 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07536 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07537 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07538 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07539
07540 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07541 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07542 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07543 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07544 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07545 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07546 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07547
07548 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07549 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07550 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07551 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07552 rb_define_method(rb_cString, "count", rb_str_count, -1);
07553
07554 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07555 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07556 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07557 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07558
07559 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07560 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07561 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07562 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07563
07564 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07565
07566 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07567 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07568
07569 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07570 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07571
07572 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07573 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07574 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07575 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07576
07577 id_to_s = rb_intern("to_s");
07578
07579 rb_fs = Qnil;
07580 rb_define_variable("$;", &rb_fs);
07581 rb_define_variable("$-F", &rb_fs);
07582
07583 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07584 rb_include_module(rb_cSymbol, rb_mComparable);
07585 rb_undef_alloc_func(rb_cSymbol);
07586 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07587 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07588
07589 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07590 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07591 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07592 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07593 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07594 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07595 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07596 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07597 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07598 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07599
07600 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07601 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07602 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07603
07604 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07605 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07606 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07607 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07608 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07609 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07610
07611 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07612 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07613 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07614 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07615
07616 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07617 }
07618