18 #define ENABLE_ECONV_NEWLINE_OPTION 1
32 #ifdef ENABLE_ECONV_NEWLINE_OPTION
45 static unsigned char *
47 const unsigned char *str,
size_t len,
48 unsigned char *caller_dst_buf,
size_t caller_dst_bufsize,
80 char ary[
sizeof(double) >
sizeof(
void*) ?
sizeof(double) :
sizeof(
void*)];
84 #define TRANSCODING_READBUF(tc) \
85 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
88 #define TRANSCODING_WRITEBUF(tc) \
89 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
90 (tc)->writebuf.ary : \
92 #define TRANSCODING_WRITEBUF_SIZE(tc) \
93 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
94 sizeof((tc)->writebuf.ary) : \
95 (size_t)(tc)->transcoder->max_output)
96 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
97 #define TRANSCODING_STATE(tc) \
98 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
178 entry->
sname = sname;
179 entry->
dname = dname;
230 #define MAX_TRANSCODER_LIBNAME_LEN 64
238 lib ? lib :
"(null)");
243 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
260 const char *dname = (
const char *)key;
280 void (*
callback)(
const char *sname,
const char *dname,
int depth,
void *
arg),
337 const char *enc = dname;
345 enc = (
const char *)val;
353 callback((
const char *)val, enc, --depth, arg);
354 enc = (
const char *)val;
370 const char *lib = entry->
lib;
380 memcpy(path, transcoder_lib_prefix,
sizeof(transcoder_lib_prefix) - 1);
381 memcpy(path +
sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
400 *repl_encname_ptr =
"UTF-8";
401 return "\xEF\xBF\xBD";
405 *repl_encname_ptr =
"US-ASCII";
414 static const unsigned char *
416 const unsigned char *in_start,
417 const unsigned char *inchar_start,
418 const unsigned char *in_p,
419 size_t *char_len_ptr)
421 const unsigned char *ptr;
422 if (inchar_start - in_start < tc->recognized_len) {
424 inchar_start,
unsigned char, in_p - inchar_start);
436 const unsigned char *in_stop,
unsigned char *out_stop,
442 ssize_t readagain_len = 0;
444 const unsigned char *inchar_start;
445 const unsigned char *in_p;
447 unsigned char *out_p;
449 in_p = inchar_start = *in_pos;
453 #define SUSPEND(ret, num) \
455 tc->resume_position = (num); \
456 if (0 < in_p - inchar_start) \
457 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
458 inchar_start, unsigned char, in_p - inchar_start); \
461 tc->recognized_len += in_p - inchar_start; \
462 if (readagain_len) { \
463 tc->recognized_len -= readagain_len; \
464 tc->readagain_len = readagain_len; \
467 resume_label ## num:; \
469 #define SUSPEND_OBUF(num) \
471 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
474 #define SUSPEND_AFTER_OUTPUT(num) \
475 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
476 SUSPEND(econv_after_output, num); \
479 #define next_table (tc->next_table)
480 #define next_info (tc->next_info)
481 #define next_byte (tc->next_byte)
482 #define writebuf_len (tc->writebuf_len)
483 #define writebuf_off (tc->writebuf_off)
487 case 1:
goto resume_label1;
488 case 2:
goto resume_label2;
489 case 3:
goto resume_label3;
490 case 4:
goto resume_label4;
491 case 5:
goto resume_label5;
492 case 6:
goto resume_label6;
493 case 7:
goto resume_label7;
494 case 8:
goto resume_label8;
495 case 9:
goto resume_label9;
496 case 10:
goto resume_label10;
497 case 11:
goto resume_label11;
498 case 12:
goto resume_label12;
499 case 13:
goto resume_label13;
500 case 14:
goto resume_label14;
501 case 15:
goto resume_label15;
502 case 16:
goto resume_label16;
503 case 17:
goto resume_label17;
504 case 18:
goto resume_label18;
505 case 19:
goto resume_label19;
506 case 20:
goto resume_label20;
507 case 21:
goto resume_label21;
508 case 22:
goto resume_label22;
509 case 23:
goto resume_label23;
510 case 24:
goto resume_label24;
511 case 25:
goto resume_label25;
512 case 26:
goto resume_label26;
513 case 27:
goto resume_label27;
514 case 28:
goto resume_label28;
515 case 29:
goto resume_label29;
516 case 30:
goto resume_label30;
517 case 31:
goto resume_label31;
518 case 32:
goto resume_label32;
519 case 33:
goto resume_label33;
520 case 34:
goto resume_label34;
530 if (in_stop <= in_p) {
537 #define BYTE_ADDR(index) (tr->byte_array + (index))
538 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
539 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
540 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
541 #define BL_MIN_BYTE (BL_BASE[0])
542 #define BL_MAX_BYTE (BL_BASE[1])
543 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
544 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
557 const unsigned char *
p = inchar_start;
570 case 0x00:
case 0x04:
case 0x08:
case 0x0C:
571 case 0x10:
case 0x14:
case 0x18:
case 0x1C:
573 while (in_p >= in_stop) {
619 const unsigned char *char_start;
643 const unsigned char *char_start;
649 char_start, (
size_t)char_len,
650 out_p, out_stop - out_p);
655 char_start, (
size_t)char_len,
667 const unsigned char *char_start;
674 out_p, out_stop - out_p);
708 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
709 readagain_len = invalid_len - discard_len;
737 out_p, out_stop - out_p);
761 const unsigned char *in_stop,
unsigned char *out_stop,
767 const unsigned char *readagain_pos = readagain_buf;
768 const unsigned char *readagain_stop = readagain_buf + tc->
readagain_len;
777 readagain_pos,
unsigned char, readagain_stop - readagain_pos);
793 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
814 const unsigned char **input_ptr,
const unsigned char *input_stop,
815 unsigned char **output_ptr,
unsigned char *output_stop,
819 input_ptr, output_ptr,
820 input_stop, output_stop,
831 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
846 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
941 for (i = 0; i < n; i++) {
950 for (i = 0; i < n; i++) {
1004 if (*sname ==
'\0' && *dname ==
'\0') {
1014 if (num_trans < 0) {
1025 ec->
flags = ecflags;
1032 #define MAX_ECFLAGS_DECORATORS 32
1055 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1056 decorators_ret[num_decorators++] =
"xml_text_escape";
1057 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1058 decorators_ret[num_decorators++] =
"xml_attr_content_escape";
1060 decorators_ret[num_decorators++] =
"xml_attr_quote";
1063 decorators_ret[num_decorators++] =
"crlf_newline";
1065 decorators_ret[num_decorators++] =
"cr_newline";
1067 decorators_ret[num_decorators++] =
"universal_newline";
1069 return num_decorators;
1081 if (num_decorators == -1)
1088 for (i = 0; i < num_decorators; i++)
1094 ec->
flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1101 const unsigned char **input_ptr,
const unsigned char *input_stop,
1102 unsigned char **output_ptr,
unsigned char *output_stop,
1109 const unsigned char **ipp, *is, *iold;
1110 unsigned char **opp, *os, *oold;
1116 for (i = start; i < ec->
num_trans; i++) {
1150 flags &= ~ECONV_AFTER_OUTPUT;
1153 f &= ~ECONV_AFTER_OUTPUT;
1157 if (iold != *ipp || oold != *opp)
1182 const unsigned char **input_ptr,
const unsigned char *input_stop,
1183 unsigned char **output_ptr,
unsigned char *output_stop,
1185 int *result_position_ptr)
1188 int needreport_index;
1191 unsigned char empty_buf;
1192 unsigned char *empty_ptr = &empty_buf;
1195 input_ptr = (
const unsigned char **)&empty_ptr;
1196 input_stop = empty_ptr;
1200 output_ptr = &empty_ptr;
1201 output_stop = empty_ptr;
1207 needreport_index = -1;
1208 for (i = ec->
num_trans-1; 0 <= i; i--) {
1216 needreport_index =
i;
1217 goto found_needreport;
1224 rb_bug(
"unexpected transcode last result");
1236 result_position_ptr);
1248 needreport_index =
trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1249 sweep_start = needreport_index + 1;
1250 }
while (needreport_index != -1 && needreport_index != ec->
num_trans-1);
1252 for (i = ec->
num_trans-1; 0 <= i; i--) {
1261 if (result_position_ptr)
1262 *result_position_ptr =
i;
1266 if (result_position_ptr)
1267 *result_position_ptr = -1;
1273 const unsigned char **input_ptr,
const unsigned char *input_stop,
1274 unsigned char **output_ptr,
unsigned char *output_stop,
1278 int result_position;
1286 if (output_stop - *output_ptr < ec->in_data_end - ec->
in_data_start) {
1287 len = output_stop - *output_ptr;
1289 *output_ptr = output_stop;
1303 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1304 len = output_stop - *output_ptr;
1307 len = input_stop - *input_ptr;
1310 *(*output_ptr)++ = *(*input_ptr)++;
1314 memcpy(*output_ptr, *input_ptr, len);
1317 if (*input_ptr != input_stop)
1329 if (data_start != data_end) {
1331 if (output_stop - *output_ptr < data_end - data_start) {
1332 len = output_stop - *output_ptr;
1333 memcpy(*output_ptr, data_start, len);
1334 *output_ptr = output_stop;
1339 len = data_end - data_start;
1340 memcpy(*output_ptr, data_start, len);
1359 *input_ptr != input_stop) {
1360 input_stop = *input_ptr;
1361 res =
rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1365 else if ((flags & ECONV_AFTER_OUTPUT) ||
1367 res =
rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1372 res =
rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1399 unsigned char utfbuf[1024];
1400 const unsigned char *utf;
1402 int utf_allocated = 0;
1403 char charef_buf[16];
1404 const unsigned char *
p;
1413 utfbuf,
sizeof(utfbuf),
1421 if (utf_len % 4 != 0)
1425 while (4 <= utf_len) {
1431 snprintf(charef_buf,
sizeof(charef_buf),
"&#x%X;", u);
1453 const unsigned char **input_ptr,
const unsigned char *input_stop,
1454 unsigned char **output_ptr,
unsigned char *output_stop,
1459 unsigned char empty_buf;
1460 unsigned char *empty_ptr = &empty_buf;
1465 input_ptr = (
const unsigned char **)&empty_ptr;
1466 input_stop = empty_ptr;
1470 output_ptr = &empty_ptr;
1471 output_stop = empty_ptr;
1475 ret =
rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1524 static unsigned char *
1526 const unsigned char *str,
size_t len,
1527 unsigned char *caller_dst_buf,
size_t caller_dst_bufsize,
1528 size_t *dst_len_ptr)
1530 unsigned char *dst_str;
1537 const unsigned char *sp;
1541 dst_bufsize = caller_dst_bufsize;
1551 dst_str = caller_dst_buf;
1553 dst_str =
xmalloc(dst_bufsize);
1556 dp = dst_str+dst_len;
1558 dst_len = dp - dst_str;
1564 if (dst_str == caller_dst_buf) {
1567 memcpy(tmp, dst_str, dst_bufsize/2);
1571 dst_str =
xrealloc(dst_str, dst_bufsize);
1573 dp = dst_str+dst_len;
1575 dst_len = dp - dst_str;
1581 *dst_len_ptr = dst_len;
1585 if (dst_str != caller_dst_buf)
1594 const unsigned char *str,
size_t len,
const char *str_encoding)
1597 unsigned char insert_buf[4096];
1598 const unsigned char *insert_str =
NULL;
1601 int last_trans_index;
1604 unsigned char **buf_start_p;
1605 unsigned char **data_start_p;
1606 unsigned char **data_end_p;
1607 unsigned char **buf_end_p;
1622 str, len, insert_buf,
sizeof(insert_buf), &insert_len);
1623 if (insert_str ==
NULL)
1638 tc = ec->
elems[last_trans_index].
tc;
1640 if (need < insert_len)
1642 if (last_trans_index == 0) {
1662 tc = ec->
elems[last_trans_index].
tc;
1665 if (*buf_start_p ==
NULL) {
1668 *data_start_p =
buf;
1670 *buf_end_p = buf+need;
1672 else if ((
size_t)(*buf_end_p - *data_end_p) < need) {
1673 MEMMOVE(*buf_start_p, *data_start_p,
unsigned char, *data_end_p - *data_start_p);
1674 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1675 *data_start_p = *buf_start_p;
1676 if ((
size_t)(*buf_end_p - *data_end_p) < need) {
1678 size_t s = (*data_end_p - *buf_start_p) + need;
1682 *data_start_p =
buf;
1683 *data_end_p = buf + (*data_end_p - *buf_start_p);
1685 *buf_end_p = buf + s;
1689 memcpy(*data_end_p, insert_str, insert_len);
1690 *data_end_p += insert_len;
1697 if (insert_str != str && insert_str != insert_buf)
1698 xfree((
void*)insert_str);
1702 if (insert_str != str && insert_str != insert_buf)
1703 xfree((
void*)insert_str);
1752 #if SIZEOF_SIZE_T > SIZEOF_INT
1815 return data.ascii_compat_name;
1821 unsigned const char *ss, *sp, *se;
1822 unsigned char *ds, *
dp, *de;
1841 unsigned long new_capa = (
unsigned long)dlen + len + max_output;
1847 ss = sp = (
const unsigned char *)
RSTRING_PTR(src) + off;
1966 for (i = 0; i < num_trans; i++) {
1968 for (k = 0; k < n; k++)
1989 int has_description = 0;
1994 if (*sname !=
'\0' || *dname !=
'\0') {
1997 else if (*dname ==
'\0')
2001 has_description = 1;
2008 const char *pre =
"";
2009 if (has_description)
2035 has_description = 1;
2037 if (!has_description) {
2074 else if (readagain_len) {
2075 bytes2 =
rb_str_new(err+error_len, readagain_len);
2111 const char *start, *end;
2135 mesg =
rb_sprintf(
"%s to %s in conversion from %s",
2159 unsigned char *(*resize_destination)(
VALUE,
size_t,
size_t),
2161 unsigned char **out_start_ptr,
2162 unsigned char **out_pos,
2163 unsigned char **out_stop_ptr)
2165 size_t len = (*out_pos - *out_start_ptr);
2166 size_t new_len = (len + max_output) * 2;
2167 *out_start_ptr = resize_destination(destination, len, new_len);
2168 *out_pos = *out_start_ptr +
len;
2169 *out_stop_ptr = *out_start_ptr + new_len;
2178 const unsigned char *replacement;
2179 const char *repl_enc;
2180 const char *ins_enc;
2195 replacement = (
unsigned char *)
"?";
2209 const unsigned char *str,
size_t len,
const char *encname)
2211 unsigned char *str2;
2213 const char *encname2;
2219 MEMCPY(str2, str,
unsigned char, len);
2255 #define hash_fallback rb_hash_aref
2277 const unsigned char *in_stop,
unsigned char *out_stop,
2279 unsigned char *(*resize_destination)(
VALUE,
size_t,
size_t),
2280 const char *src_encoding,
2281 const char *dst_encoding,
2288 unsigned char *out_start = *out_pos;
2324 rep = (*fallback_func)(fallback, rep);
2329 if ((
int)ret == -1) {
2345 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2355 transcode_loop(
const unsigned char **in_pos,
unsigned char **out_pos,
2356 const unsigned char *in_stop,
unsigned char *out_stop,
2358 unsigned char *(*resize_destination)(
VALUE,
size_t,
size_t),
2359 const char *src_encoding,
2360 const char *dst_encoding,
2367 unsigned char *out_start = *out_pos;
2368 const unsigned char *ptr;
2382 unsigned char input_byte;
2383 const unsigned char *
p = &input_byte;
2386 if (ptr < in_stop) {
2397 if (&input_byte != p)
2398 ptr += p - &input_byte;
2409 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2430 static unsigned char *
2483 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2510 int setflags = 0, newlineflag = 0;
2515 newlineflag |= !
NIL_P(v);
2520 newlineflag |= !
NIL_P(v);
2525 newlineflag |= !
NIL_P(v);
2529 ecflags |= setflags;
2542 if (
NIL_P(opthash)) {
2574 if (!
NIL_P(newhash))
2593 if (
NIL_P(opthash)) {
2598 rb_bug(
"rb_econv_open_opts called with invalid opthash");
2602 ec =
rb_econv_open(source_encoding, destination_encoding, ecflags);
2606 if (!
NIL_P(replacement)) {
2652 const char *sname, *dname;
2653 int sencidx, dencidx;
2655 dencidx =
enc_arg(arg1, &dname, &denc);
2663 sencidx =
enc_arg(arg2, &sname, &senc);
2678 volatile VALUE arg1, arg2;
2680 unsigned char *
buf, *
bp, *sp;
2681 const unsigned char *fromp;
2683 const char *sname, *dname;
2686 if (argc <0 || argc > 2) {
2693 if (!ecflags)
return -1;
2701 arg2 = argc<=1 ?
Qnil : argv[1];
2708 if (senc && senc == denc) {
2709 return NIL_P(arg2) ? -1 : dencidx;
2717 return NIL_P(arg2) ? -1 : dencidx;
2734 if (fromp != sp+slen) {
2807 if (encidx < 0)
return str;
2808 if (newstr == str) {
2892 int encidx =
str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2900 if (newstr == str) {
2982 const char *arg_name, *result_name;
2985 enc_arg(&arg, &arg_name, &arg_enc);
2989 if (result_name ==
NULL)
2999 volatile VALUE *snamev_p,
volatile VALUE *dnamev_p,
3000 const char **sname_p,
const char **dname_p,
3005 VALUE opt, flags_v, ecopts;
3007 const char *sname, *dname;
3011 argc =
rb_scan_args(argc, argv,
"21:", snamev_p, dnamev_p, &flags_v, &opt);
3013 if (!
NIL_P(flags_v)) {
3021 else if (!
NIL_P(opt)) {
3054 *ecflags_p = ecflags;
3067 if (num_decorators == -1)
3083 rb_ary_store(convpath, len + num_decorators - 1, pair);
3087 rb_ary_store(convpath, len + num_decorators - 1, pair);
3091 for (i = 0; i < num_decorators; i++)
3103 if (*ary_p ==
Qnil) {
3144 volatile VALUE snamev, dnamev;
3145 const char *sname, *dname;
3151 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3156 if (
NIL_P(convpath))
3176 return RTEST(convpath);
3202 const char **sname_p,
const char **dname_p,
3210 const char *sname, *dname;
3216 volatile VALUE snamev, dnamev;
3223 enc_arg(&snamev, &sname, &senc);
3225 enc_arg(&dnamev, &dname, &denc);
3243 if (ret == -1 || arg.
ret == -1)
3378 volatile VALUE snamev, dnamev;
3379 const char *sname, *dname;
3395 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3436 return rb_sprintf(
"#<%s: uninitialized>", cname);
3683 const unsigned char *ip, *is;
3684 unsigned char *op, *os;
3685 long output_byteoffset, output_bytesize;
3686 unsigned long output_byteend;
3689 argc =
rb_scan_args(argc, argv,
"23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3691 if (
NIL_P(output_byteoffset_v))
3692 output_byteoffset = 0;
3694 output_byteoffset =
NUM2LONG(output_byteoffset_v);
3696 if (
NIL_P(output_bytesize_v))
3697 output_bytesize = 0;
3699 output_bytesize =
NUM2LONG(output_bytesize_v);
3701 if (!
NIL_P(flags_v)) {
3708 else if (!
NIL_P(opt)) {
3727 if (
NIL_P(output_bytesize_v)) {
3735 if (
NIL_P(output_byteoffset_v))
3738 if (output_byteoffset < 0)
3744 if (output_bytesize < 0)
3747 output_byteend = (
unsigned long)output_byteoffset +
3748 (
unsigned long)output_bytesize;
3750 if (output_byteend < (
unsigned long)output_byteoffset ||
3765 op = (
unsigned char *)
RSTRING_PTR(output) + output_byteoffset;
3766 os = op + output_bytesize;
3774 if (
LONG_MAX / 2 < output_bytesize)
3776 output_bytesize *= 2;
3777 output_byteoffset_v =
Qnil;
3855 rb_bug(
"unexpected result of econv_primitive_convert");
3899 rb_bug(
"unexpected result of econv_primitive_convert");
4041 const char *insert_enc;
4098 if (putbackable < n)
4421 #ifdef ENABLE_ECONV_NEWLINE_OPTION