Ruby 1.9.3p448 (2013-06-27 revision 41675)
re.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  re.c -
4 
5  $Author: akr $
6  created at: Mon Aug 9 18:24:49 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9 
10 **********************************************************************/
11 
12 #include "ruby/ruby.h"
13 #include "ruby/re.h"
14 #include "ruby/encoding.h"
15 #include "ruby/util.h"
16 #include "internal.h"
17 #include "regint.h"
18 #include <ctype.h>
19 
21 
23 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
24 
25 #define BEG(no) (regs->beg[(no)])
26 #define END(no) (regs->end[(no)])
27 
/*
 * Case-folding lookup table indexed by byte value.  Every entry equals its
 * own index except the 'A'..'Z' slots, which hold the matching lowercase
 * letter.  The table is only compiled when 'a' == 97; for any other
 * character set the #error below stops the build.
 * NOTE(review): the element type is plain char, so entries from '\200'
 * upwards are negative wherever char is signed -- confirm that callers
 * which subtract table entries are happy with that ordering.
 */
28 #if 'a' == 97 /* it's ascii */
29 static const char casetable[] = {
30  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
31  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
32  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
33  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
34  /* ' ' '!' '"' '#' '$' '%' '&' ''' */
35  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
36  /* '(' ')' '*' '+' ',' '-' '.' '/' */
37  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
38  /* '0' '1' '2' '3' '4' '5' '6' '7' */
39  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
40  /* '8' '9' ':' ';' '<' '=' '>' '?' */
41  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
42  /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */
43  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
44  /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */
45  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
46  /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */
47  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
48  /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */
49  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
50  /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */
51  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
52  /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */
53  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
54  /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */
55  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
56  /* 'x' 'y' 'z' '{' '|' '}' '~' */
57  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
58  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
59  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
60  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
61  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
62  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
63  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
64  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
65  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
66  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
67  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
68  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
69  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
70  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
71  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
72  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
73  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
74 };
75 #else
76 # error >>> "You lose. You will need a translation table for your character set." <<<
77 #endif
78 
79 int
80 rb_memcicmp(const void *x, const void *y, long len)
81 {
82  const unsigned char *p1 = x, *p2 = y;
83  int tmp;
84 
85  while (len--) {
86  if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
87  return tmp;
88  }
89  return 0;
90 }
91 
92 #undef rb_memcmp
93 
/*
 * Out-of-line wrapper around memcmp(): compares the first len bytes of p1
 * and p2 and returns memcmp()'s result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
99 
100 static inline long
101 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
102 {
103  const unsigned char *x = xs, *xe = xs + m;
104  const unsigned char *y = ys, *ye = ys + n;
105 #ifndef VALUE_MAX
106 # if SIZEOF_VALUE == 8
107 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
108 # elif SIZEOF_VALUE == 4
109 # define VALUE_MAX 0xFFFFFFFFUL
110 # endif
111 #endif
112  VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
113 
114  if (m > SIZEOF_VALUE)
115  rb_bug("!!too long pattern string!!");
116 
117  /* Prepare hash value */
118  for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
119  hx <<= CHAR_BIT;
120  hy <<= CHAR_BIT;
121  hx |= *x;
122  hy |= *y;
123  }
124  /* Searching */
125  while (hx != hy) {
126  if (y == ye)
127  return -1;
128  hy <<= CHAR_BIT;
129  hy |= *y;
130  hy &= mask;
131  y++;
132  }
133  return y - ys - m;
134 }
135 
136 static inline long
137 rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
138 {
139  const unsigned char *x = xs, *xe = xs + m;
140  const unsigned char *y = ys;
141  VALUE i, qstable[256];
142 
143  /* Preprocessing */
144  for (i = 0; i < 256; ++i)
145  qstable[i] = m + 1;
146  for (; x < xe; ++x)
147  qstable[*x] = xe - x;
148  /* Searching */
149  for (; y + m <= ys + n; y += *(qstable + y[m])) {
150  if (*xs == *y && memcmp(xs, y, m) == 0)
151  return y - ys;
152  }
153  return -1;
154 }
155 
/*
 * Maps the byte sequence starting at x to a slot number below 512.
 *
 * A first byte below 0xC0, or 0xF5 and above, yields that byte plus 256
 * (slots 256..511).  A first byte in 0xC0..0xDF, 0xE0..0xEF or 0xF0..0xF4
 * is mixed with the next one, two or three bytes respectively and the
 * result is reduced to its low eight bits (slots 0..255).
 *
 * The following bytes are read without any bounds information.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trailing;
    int k;

    if (h < 0xC0 || h >= 0xF5) {
        return h + 256;
    }

    trailing = (h < 0xE0) ? 1 : (h < 0xF0) ? 2 : 3;
    for (k = 1; k <= trailing; k++) {
        h = h * mix + x[k];
    }
    return (unsigned char)h;
}
187 
188 static inline long
189 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
190 {
191  const unsigned char *x = xs, *xe = xs + m;
192  const unsigned char *y = ys;
193  VALUE i, qstable[512];
194 
195  /* Preprocessing */
196  for (i = 0; i < 512; ++i) {
197  qstable[i] = m + 1;
198  }
199  for (; x < xe; ++x) {
200  qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
201  }
202  /* Searching */
203  for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
204  if (*xs == *y && memcmp(xs, y, m) == 0)
205  return y - ys;
206  }
207  return -1;
208 }
209 
210 long
211 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
212 {
213  const unsigned char *x = x0, *y = y0;
214 
215  if (m > n) return -1;
216  else if (m == n) {
217  return memcmp(x0, y0, m) == 0 ? 0 : -1;
218  }
219  else if (m < 1) {
220  return 0;
221  }
222  else if (m == 1) {
223  const unsigned char *ys = y, *ye = ys + n;
224  for (; y < ye; ++y) {
225  if (*x == *y)
226  return y - ys;
227  }
228  return -1;
229  }
230  else if (m <= SIZEOF_VALUE) {
231  return rb_memsearch_ss(x0, m, y0, n);
232  }
233  else if (enc == rb_utf8_encoding()){
234  return rb_memsearch_qs_utf8(x0, m, y0, n);
235  }
236  else {
237  return rb_memsearch_qs(x0, m, y0, n);
238  }
239 }
240 
241 #define REG_LITERAL FL_USER5
242 #define REG_ENCODING_NONE FL_USER6
243 
244 #define KCODE_FIXED FL_USER4
245 
246 #define ARG_REG_OPTION_MASK \
247  (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
248 #define ARG_ENCODING_FIXED 16
249 #define ARG_ENCODING_NONE 32
250 
251 static int
253 {
254  int val;
255 
256  switch (c) {
257  case 'i':
259  break;
260  case 'x':
261  val = ONIG_OPTION_EXTEND;
262  break;
263  case 'm':
264  val = ONIG_OPTION_MULTILINE;
265  break;
266  default:
267  val = 0;
268  break;
269  }
270  return val;
271 }
272 
273 static char *
274 option_to_str(char str[4], int options)
275 {
276  char *p = str;
277  if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
278  if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
279  if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
280  *p = 0;
281  return str;
282 }
283 
284 extern int
285 rb_char_to_option_kcode(int c, int *option, int *kcode)
286 {
287  *option = 0;
288 
289  switch (c) {
290  case 'n':
291  *kcode = rb_ascii8bit_encindex();
292  return (*option = ARG_ENCODING_NONE);
293  case 'e':
294  *kcode = rb_enc_find_index("EUC-JP");
295  break;
296  case 's':
297  *kcode = rb_enc_find_index("Windows-31J");
298  break;
299  case 'u':
300  *kcode = rb_utf8_encindex();
301  break;
302  default:
303  *kcode = -1;
304  return (*option = char_to_option(c));
305  }
306  *option = ARG_ENCODING_FIXED;
307  return 1;
308 }
309 
310 static void
312 {
313  if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
314  rb_raise(rb_eTypeError, "uninitialized Regexp");
315  }
316 }
317 
/*
 * Appends the regexp source s[0..len) to str in a form suitable for display.
 *
 *   str    - string buffer that receives the output
 *   s, len - source bytes of the pattern
 *   enc    - encoding used to decode s
 *   resenc - encoding of the result; may be NULL.  When non-NULL and escaping
 *            is needed, valid non-ASCII characters are emitted through
 *            rb_str_buf_cat_escaped_char() instead of being copied raw.
 *
 * A first pass decides whether any escaping is required; if not, the source
 * is appended verbatim.  Otherwise a second pass rewrites it piece by piece.
 */
318 static void
319 rb_reg_expr_str(VALUE str, const char *s, long len,
320  rb_encoding *enc, rb_encoding *resenc)
321 {
322  const char *p, *pend;
323  int cr = ENC_CODERANGE_UNKNOWN;
324  int need_escape = 0;
325  int c, clen;
326 
327  p = s; pend = p + len;
328  rb_str_coderange_scan_restartable(p, pend, enc, &cr);
/* Pass 1: only an ASCII-compatible, valid (or 7-bit) source can be copied as is. */
329  if (rb_enc_asciicompat(enc) &&
330  (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) {
331  while (p < pend) {
332  c = rb_enc_ascget(p, pend, &clen, enc);
333  if (c == -1) {
/* Non-ASCII character: acceptable only if source and result encodings are the same. */
334  if (enc == resenc) {
335  p += mbclen(p, pend, enc);
336  }
337  else {
338  need_escape = 1;
339  break;
340  }
341  }
342  else if (c != '/' && rb_enc_isprint(c, enc)) {
343  p += clen;
344  }
345  else {
/* A '/' or a non-printable ASCII character forces the escaping pass. */
346  need_escape = 1;
347  break;
348  }
349  }
350  }
351  else {
352  need_escape = 1;
353  }
354 
355  if (!need_escape) {
356  rb_str_buf_cat(str, s, len);
357  }
358  else {
/* Pass 2: restart from the beginning and emit each character individually. */
359  int unicode_p = rb_enc_unicode_p(enc);
360  p = s;
361  while (p<pend) {
362  c = rb_enc_ascget(p, pend, &clen, enc);
363  if (c == '\\' && p+clen < pend) {
/* Existing escape: copy the backslash together with the character after it. */
364  int n = clen + mbclen(p+clen, pend, enc);
365  rb_str_buf_cat(str, p, n);
366  p += n;
367  continue;
368  }
369  else if (c == '/') {
/* Bare slash: emit it preceded by a backslash (this c shadows the outer one). */
370  char c = '\\';
371  rb_str_buf_cat(str, &c, 1);
372  rb_str_buf_cat(str, p, clen);
373  }
374  else if (c == -1) {
375  clen = rb_enc_precise_mbclen(p, pend, enc);
376  if (!MBCLEN_CHARFOUND_P(clen)) {
/* Not a complete character: emit this single byte as \xHH via the hex label below. */
377  c = (unsigned char)*p;
378  clen = 1;
379  goto hex;
380  }
381  if (resenc) {
382  unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
383  rb_str_buf_cat_escaped_char(str, c, unicode_p);
384  }
385  else {
386  clen = MBCLEN_CHARFOUND_LEN(clen);
387  rb_str_buf_cat(str, p, clen);
388  }
389  }
390  else if (rb_enc_isprint(c, enc)) {
391  rb_str_buf_cat(str, p, clen);
392  }
393  else if (!rb_enc_isspace(c, enc)) {
/* Non-printable, non-space ASCII: written as a four-character \xHH escape. */
394  char b[8];
395 
/* Also entered by the goto above, which jumps into this block. */
396  hex:
397  snprintf(b, sizeof(b), "\\x%02X", c);
398  rb_str_buf_cat(str, b, 4);
399  }
400  else {
/* Whitespace is copied unchanged. */
401  rb_str_buf_cat(str, p, clen);
402  }
403  p += clen;
404  }
405  }
406 }
407 
408 static VALUE
409 rb_reg_desc(const char *s, long len, VALUE re)
410 {
411  rb_encoding *enc = rb_enc_get(re);
412  VALUE str = rb_str_buf_new2("/");
414  if (resenc == NULL) resenc = rb_default_external_encoding();
415 
416  if (re && rb_enc_asciicompat(enc)) {
417  rb_enc_copy(str, re);
418  }
419  else {
421  }
422  rb_reg_expr_str(str, s, len, enc, resenc);
423  rb_str_buf_cat2(str, "/");
424  if (re) {
425  char opts[4];
426  rb_reg_check(re);
427  if (*option_to_str(opts, RREGEXP(re)->ptr->options))
428  rb_str_buf_cat2(str, opts);
429  if (RBASIC(re)->flags & REG_ENCODING_NONE)
430  rb_str_buf_cat2(str, "n");
431  }
432  OBJ_INFECT(str, re);
433  return str;
434 }
435 
436 
437 /*
438  * call-seq:
439  * rxp.source -> str
440  *
441  * Returns the original string of the pattern.
442  *
443  * /ab+c/ix.source #=> "ab+c"
444  *
445  * Note that escape sequences are retained as is.
446  *
447  * /\x20\+/.source #=> "\\x20\\+"
448  *
449  */
450 
451 static VALUE
453 {
454  VALUE str;
455 
456  rb_reg_check(re);
458  if (OBJ_TAINTED(re)) OBJ_TAINT(str);
459  return str;
460 }
461 
462 /*
463  * call-seq:
464  * rxp.inspect -> string
465  *
466  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
467  * <code>#inspect</code> actually produces the more natural version of
468  * the string than <code>#to_s</code>.
469  *
470  * /ab+c/ix.inspect #=> "/ab+c/ix"
471  *
472  */
473 
474 static VALUE
476 {
477  if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
478  return rb_any_to_s(re);
479  }
480  return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
481 }
482 
483 
484 /*
485  * call-seq:
486  * rxp.to_s -> str
487  *
488  * Returns a string containing the regular expression and its options (using the
489  * <code>(?opts:source)</code> notation). This string can be fed back in to
490  * <code>Regexp::new</code> to create a regular expression with the same semantics as
491  * the original. (However, <code>Regexp#==</code> may not return true when
492  * comparing the two, as the source of the regular expression itself may
493  * differ, as the example shows). <code>Regexp#inspect</code> produces a
494  * generally more readable version of <i>rxp</i>.
495  *
496  * r1 = /ab+c/ix #=> /ab+c/ix
497  * s1 = r1.to_s #=> "(?ix-m:ab+c)"
498  * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
499  * r1 == r2 #=> false
500  * r1.source #=> "ab+c"
501  * r2.source #=> "(?ix-m:ab+c)"
502  */
503 
504 static VALUE
506 {
507  int options, opt;
509  long len;
510  const UChar* ptr;
511  VALUE str = rb_str_buf_new2("(?");
512  char optbuf[5];
513  rb_encoding *enc = rb_enc_get(re);
514 
515  rb_reg_check(re);
516 
517  rb_enc_copy(str, re);
518  options = RREGEXP(re)->ptr->options;
519  ptr = (UChar*)RREGEXP_SRC_PTR(re);
520  len = RREGEXP_SRC_LEN(re);
521  again:
522  if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
523  int err = 1;
524  ptr += 2;
525  if ((len -= 2) > 0) {
526  do {
527  opt = char_to_option((int )*ptr);
528  if (opt != 0) {
529  options |= opt;
530  }
531  else {
532  break;
533  }
534  ++ptr;
535  } while (--len > 0);
536  }
537  if (len > 1 && *ptr == '-') {
538  ++ptr;
539  --len;
540  do {
541  opt = char_to_option((int )*ptr);
542  if (opt != 0) {
543  options &= ~opt;
544  }
545  else {
546  break;
547  }
548  ++ptr;
549  } while (--len > 0);
550  }
551  if (*ptr == ')') {
552  --len;
553  ++ptr;
554  goto again;
555  }
556  if (*ptr == ':' && ptr[len-1] == ')') {
557  Regexp *rp;
558 
559  ++ptr;
560  len -= 2;
561  err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
562  enc, OnigDefaultSyntax, NULL);
563  onig_free(rp);
564  }
565  if (err) {
566  options = RREGEXP(re)->ptr->options;
567  ptr = (UChar*)RREGEXP_SRC_PTR(re);
568  len = RREGEXP_SRC_LEN(re);
569  }
570  }
571 
572  if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
573 
574  if ((options & embeddable) != embeddable) {
575  optbuf[0] = '-';
576  option_to_str(optbuf + 1, ~options);
577  rb_str_buf_cat2(str, optbuf);
578  }
579 
580  rb_str_buf_cat2(str, ":");
581  rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
582  rb_str_buf_cat2(str, ")");
583  rb_enc_copy(str, re);
584 
585  OBJ_INFECT(str, re);
586  return str;
587 }
588 
589 static void
590 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
591 {
592  volatile VALUE desc = rb_reg_desc(s, len, re);
593 
594  rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
595 }
596 
597 static VALUE
598 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
599 {
600  char opts[6];
601  VALUE desc = rb_str_buf_new2(err);
603  if (resenc == NULL) resenc = rb_default_external_encoding();
604 
605  rb_enc_associate(desc, enc);
606  rb_str_buf_cat2(desc, ": /");
607  rb_reg_expr_str(desc, s, len, enc, resenc);
608  opts[0] = '/';
609  option_to_str(opts + 1, options);
610  rb_str_buf_cat2(desc, opts);
611  return rb_exc_new3(rb_eRegexpError, desc);
612 }
613 
614 static void
615 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
616 {
617  rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
618 }
619 
620 static VALUE
621 rb_reg_error_desc(VALUE str, int options, const char *err)
622 {
624  rb_enc_get(str), options, err);
625 }
626 
627 static void
628 rb_reg_raise_str(VALUE str, int options, const char *err)
629 {
630  rb_exc_raise(rb_reg_error_desc(str, options, err));
631 }
632 
633 
634 /*
635  * call-seq:
636  * rxp.casefold? -> true or false
637  *
638  * Returns the value of the case-insensitive flag.
639  *
640  * /a/.casefold? #=> false
641  * /a/i.casefold? #=> true
642  * /(?i:a)/.casefold? #=> false
643  */
644 
645 static VALUE
647 {
648  rb_reg_check(re);
649  if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
650  return Qfalse;
651 }
652 
653 
654 /*
655  * call-seq:
656  * rxp.options -> fixnum
657  *
658  * Returns the set of bits corresponding to the options used when creating this
659  * Regexp (see <code>Regexp::new</code> for details). Note that additional bits
660  * may be set in the returned options: these are used internally by the regular
661  * expression code. These extra bits are ignored if the options are passed to
662  * <code>Regexp::new</code>.
663  *
664  * Regexp::IGNORECASE #=> 1
665  * Regexp::EXTENDED #=> 2
666  * Regexp::MULTILINE #=> 4
667  *
668  * /cat/.options #=> 0
669  * /cat/ix.options #=> 3
670  * Regexp.new('cat', true).options #=> 1
671  * /\xa1\xa2/e.options #=> 16
672  *
673  * r = /cat/ix
674  * Regexp.new(r.source, r.options) #=> /cat/ix
675  */
676 
677 static VALUE
679 {
680  int options = rb_reg_options(re);
681  return INT2NUM(options);
682 }
683 
684 static int
685 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
686  int back_num, int *back_refs, OnigRegex regex, void *arg)
687 {
688  VALUE ary = (VALUE)arg;
689  rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
690  return 0;
691 }
692 
693 /*
694  * call-seq:
695  * rxp.names -> [name1, name2, ...]
696  *
697  * Returns a list of names of captures as an array of strings.
698  *
699  * /(?<foo>.)(?<bar>.)(?<baz>.)/.names
700  * #=> ["foo", "bar", "baz"]
701  *
702  * /(?<foo>.)(?<foo>.)/.names
703  * #=> ["foo"]
704  *
705  * /(.)(.)/.names
706  * #=> []
707  */
708 
709 static VALUE
711 {
712  VALUE ary = rb_ary_new();
713  rb_reg_check(re);
714  onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
715  return ary;
716 }
717 
718 static int
720  int back_num, int *back_refs, OnigRegex regex, void *arg)
721 {
722  VALUE hash = (VALUE)arg;
723  VALUE ary = rb_ary_new2(back_num);
724  int i;
725 
726  for(i = 0; i < back_num; i++)
727  rb_ary_store(ary, i, INT2NUM(back_refs[i]));
728 
729  rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
730 
731  return 0;
732 }
733 
734 /*
735  * call-seq:
736  * rxp.named_captures -> hash
737  *
738  * Returns a hash representing information about named captures of <i>rxp</i>.
739  *
740  * A key of the hash is a name of the named captures.
741  * A value of the hash is an array which is a list of indexes of corresponding
742  * named captures.
743  *
744  * /(?<foo>.)(?<bar>.)/.named_captures
745  * #=> {"foo"=>[1], "bar"=>[2]}
746  *
747  * /(?<foo>.)(?<foo>.)/.named_captures
748  * #=> {"foo"=>[1, 2]}
749  *
750  * If there are no named captures, an empty hash is returned.
751  *
752  * /(.)(.)/.named_captures
753  * #=> {}
754  */
755 
756 static VALUE
758 {
759  VALUE hash = rb_hash_new();
760  rb_reg_check(re);
761  onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
762  return hash;
763 }
764 
765 static int
766 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
767  OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
768  OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
769 {
770  int r;
771 
772  *reg = (regex_t* )xmalloc(sizeof(regex_t));
773  if (IS_NULL(*reg)) return ONIGERR_MEMORY;
774 
775  r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
776  if (r) goto err;
777 
778  r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
779  if (r) {
780  err:
781  onig_free(*reg);
782  *reg = NULL;
783  }
784  return r;
785 }
786 
787 static Regexp*
788 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
789  const char *sourcefile, int sourceline)
790 {
791  Regexp *rp;
792  int r;
793  OnigErrorInfo einfo;
794 
795  /* Handle escaped characters first. */
796 
797  /* Build a copy of the string (in dest) with the
798  escaped characters translated, and generate the regex
799  from that.
800  */
801 
802  r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
803  enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
804  if (r) {
805  onig_error_code_to_str((UChar*)err, r, &einfo);
806  return 0;
807  }
808  return rp;
809 }
810 
811 
812 /*
813  * Document-class: MatchData
814  *
815  * <code>MatchData</code> is the type of the special variable <code>$~</code>,
816  * and is the type of the object returned by <code>Regexp#match</code> and
817  * <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
818  * match, results normally accessed through the special variables
819  * <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
820  * <code>$2</code>, and so on.
821  *
822  */
823 
825 
826 static VALUE
828 {
829  NEWOBJ(match, struct RMatch);
830  OBJSETUP(match, klass, T_MATCH);
831 
832  match->str = 0;
833  match->rmatch = 0;
834  match->regexp = 0;
835  match->rmatch = ALLOC(struct rmatch);
836  MEMZERO(match->rmatch, struct rmatch, 1);
837 
838  return (VALUE)match;
839 }
840 
/*
 * A byte offset into a string paired with the character offset computed
 * for it.
 */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * Comparator for qsort()/bsearch() that orders pair_t records by byte_pos.
 * Returns -1, 0 or 1 when pair1's byte_pos is less than, equal to or
 * greater than pair2's.
 *
 * The values are compared directly rather than subtracted, so the result
 * needs no narrowing from long to int and does not depend on
 * SIZEOF_LONG/SIZEOF_INT being defined.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
856 
857 static void
859 {
860  struct rmatch *rm = RMATCH(match)->rmatch;
861  struct re_registers *regs;
862  int i, num_regs, num_pos;
863  long c;
864  char *s, *p, *q;
865  rb_encoding *enc;
866  pair_t *pairs;
867 
868  if (rm->char_offset_updated)
869  return;
870 
871  regs = &rm->regs;
872  num_regs = rm->regs.num_regs;
873 
874  if (rm->char_offset_num_allocated < num_regs) {
875  REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
877  }
878 
879  enc = rb_enc_get(RMATCH(match)->str);
880  if (rb_enc_mbmaxlen(enc) == 1) {
881  for (i = 0; i < num_regs; i++) {
882  rm->char_offset[i].beg = BEG(i);
883  rm->char_offset[i].end = END(i);
884  }
885  rm->char_offset_updated = 1;
886  return;
887  }
888 
889  pairs = ALLOCA_N(pair_t, num_regs*2);
890  num_pos = 0;
891  for (i = 0; i < num_regs; i++) {
892  if (BEG(i) < 0)
893  continue;
894  pairs[num_pos++].byte_pos = BEG(i);
895  pairs[num_pos++].byte_pos = END(i);
896  }
897  qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
898 
899  s = p = RSTRING_PTR(RMATCH(match)->str);
900  c = 0;
901  for (i = 0; i < num_pos; i++) {
902  q = s + pairs[i].byte_pos;
903  c += rb_enc_strlen(p, q, enc);
904  pairs[i].char_pos = c;
905  p = q;
906  }
907 
908  for (i = 0; i < num_regs; i++) {
909  pair_t key, *found;
910  if (BEG(i) < 0) {
911  rm->char_offset[i].beg = -1;
912  rm->char_offset[i].end = -1;
913  continue;
914  }
915 
916  key.byte_pos = BEG(i);
917  found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
918  rm->char_offset[i].beg = found->char_pos;
919 
920  key.byte_pos = END(i);
921  found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
922  rm->char_offset[i].end = found->char_pos;
923  }
924 
925  rm->char_offset_updated = 1;
926 }
927 
928 static void
930 {
931  if (!RMATCH(match)->regexp) {
932  rb_raise(rb_eTypeError, "uninitialized Match");
933  }
934 }
935 
936 /* :nodoc: */
937 static VALUE
939 {
940  struct rmatch *rm;
941 
942  if (obj == orig) return obj;
943 
944  if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
945  rb_raise(rb_eTypeError, "wrong argument class");
946  }
947  RMATCH(obj)->str = RMATCH(orig)->str;
948  RMATCH(obj)->regexp = RMATCH(orig)->regexp;
949 
950  rm = RMATCH(obj)->rmatch;
951  onig_region_copy(&rm->regs, RMATCH_REGS(orig));
952 
953  if (!RMATCH(orig)->rmatch->char_offset_updated) {
954  rm->char_offset_updated = 0;
955  }
956  else {
957  if (rm->char_offset_num_allocated < rm->regs.num_regs) {
958  REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
960  }
962  struct rmatch_offset, rm->regs.num_regs);
963  rm->char_offset_updated = 1;
964  }
965 
966  return obj;
967 }
968 
969 
970 /*
971  * call-seq:
972  * mtch.regexp -> regexp
973  *
974  * Returns the regexp.
975  *
976  * m = /a.*b/.match("abc")
977  * m.regexp #=> /a.*b/
978  */
979 
980 static VALUE
982 {
983  match_check(match);
984  return RMATCH(match)->regexp;
985 }
986 
987 /*
988  * call-seq:
989  * mtch.names -> [name1, name2, ...]
990  *
991  * Returns a list of names of captures as an array of strings.
992  * It is the same as mtch.regexp.names.
993  *
994  * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
995  * #=> ["foo", "bar", "baz"]
996  *
997  * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
998  * m.names #=> ["x", "y"]
999  */
1000 
1001 static VALUE
1003 {
1004  match_check(match);
1005  return rb_reg_names(RMATCH(match)->regexp);
1006 }
1007 
1008 /*
1009  * call-seq:
1010  * mtch.length -> integer
1011  * mtch.size -> integer
1012  *
1013  * Returns the number of elements in the match array.
1014  *
1015  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1016  * m.length #=> 5
1017  * m.size #=> 5
1018  */
1019 
1020 static VALUE
1022 {
1023  match_check(match);
1024  return INT2FIX(RMATCH_REGS(match)->num_regs);
1025 }
1026 
1027 static int
1029 {
1030  const char *name;
1031  int num;
1032 
1033  struct re_registers *regs = RMATCH_REGS(match);
1034  VALUE regexp = RMATCH(match)->regexp;
1035 
1036  match_check(match);
1037  switch(TYPE(backref)) {
1038  default:
1039  return NUM2INT(backref);
1040 
1041  case T_SYMBOL:
1042  name = rb_id2name(SYM2ID(backref));
1043  break;
1044 
1045  case T_STRING:
1046  name = StringValueCStr(backref);
1047  break;
1048  }
1049 
1050  num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
1051  (const unsigned char*)name,
1052  (const unsigned char*)name + strlen(name),
1053  regs);
1054 
1055  if (num < 1) {
1056  rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
1057  }
1058 
1059  return num;
1060 }
1061 
1062 int
1064 {
1065  return match_backref_number(match, backref);
1066 }
1067 
1068 /*
1069  * call-seq:
1070  * mtch.offset(n) -> array
1071  *
1072  * Returns a two-element array containing the beginning and ending offsets of
1073  * the <em>n</em>th match.
1074  * <em>n</em> can be a string or symbol to reference a named capture.
1075  *
1076  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1077  * m.offset(0) #=> [1, 7]
1078  * m.offset(4) #=> [6, 7]
1079  *
1080  * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1081  * p m.offset(:foo) #=> [0, 1]
1082  * p m.offset(:bar) #=> [2, 3]
1083  *
1084  */
1085 
1086 static VALUE
1088 {
1089  int i = match_backref_number(match, n);
1090  struct re_registers *regs = RMATCH_REGS(match);
1091 
1092  match_check(match);
1093  if (i < 0 || regs->num_regs <= i)
1094  rb_raise(rb_eIndexError, "index %d out of matches", i);
1095 
1096  if (BEG(i) < 0)
1097  return rb_assoc_new(Qnil, Qnil);
1098 
1099  update_char_offset(match);
1100  return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
1101  INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
1102 }
1103 
1104 
1105 /*
1106  * call-seq:
1107  * mtch.begin(n) -> integer
1108  *
1109  * Returns the offset of the start of the <em>n</em>th element of the match
1110  * array in the string.
1111  * <em>n</em> can be a string or symbol to reference a named capture.
1112  *
1113  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1114  * m.begin(0) #=> 1
1115  * m.begin(2) #=> 2
1116  *
1117  * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1118  * p m.begin(:foo) #=> 0
1119  * p m.begin(:bar) #=> 2
1120  */
1121 
1122 static VALUE
1124 {
1125  int i = match_backref_number(match, n);
1126  struct re_registers *regs = RMATCH_REGS(match);
1127 
1128  match_check(match);
1129  if (i < 0 || regs->num_regs <= i)
1130  rb_raise(rb_eIndexError, "index %d out of matches", i);
1131 
1132  if (BEG(i) < 0)
1133  return Qnil;
1134 
1135  update_char_offset(match);
1136  return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
1137 }
1138 
1139 
1140 /*
1141  * call-seq:
1142  * mtch.end(n) -> integer
1143  *
1144  * Returns the offset of the character immediately following the end of the
1145  * <em>n</em>th element of the match array in the string.
1146  * <em>n</em> can be a string or symbol to reference a named capture.
1147  *
1148  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1149  * m.end(0) #=> 7
1150  * m.end(2) #=> 3
1151  *
1152  * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1153  * p m.end(:foo) #=> 1
1154  * p m.end(:bar) #=> 3
1155  */
1156 
1157 static VALUE
1159 {
1160  int i = match_backref_number(match, n);
1161  struct re_registers *regs = RMATCH_REGS(match);
1162 
1163  match_check(match);
1164  if (i < 0 || regs->num_regs <= i)
1165  rb_raise(rb_eIndexError, "index %d out of matches", i);
1166 
1167  if (BEG(i) < 0)
1168  return Qnil;
1169 
1170  update_char_offset(match);
1171  return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
1172 }
1173 
1174 #define MATCH_BUSY FL_USER2
1175 
1176 void
1178 {
1179  FL_SET(match, MATCH_BUSY);
1180 }
1181 
1182 /*
1183  * call-seq:
1184  * rxp.fixed_encoding? -> true or false
1185  *
1186  * Returns false if rxp is applicable to
1187  * a string with any ASCII compatible encoding.
1188  * Returns true otherwise.
1189  *
1190  * r = /a/
1191  * r.fixed_encoding? #=> false
1192  * r =~ "\u{6666} a" #=> 2
1193  * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2
1194  * r =~ "abc".force_encoding("euc-jp") #=> 0
1195  *
1196  * r = /a/u
1197  * r.fixed_encoding? #=> true
1198  * r.encoding #=> #<Encoding:UTF-8>
1199  * r =~ "\u{6666} a" #=> 2
1200  * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError
1201  * r =~ "abc".force_encoding("euc-jp") #=> 0
1202  *
1203  * r = /\u{6666}/
1204  * r.fixed_encoding? #=> true
1205  * r.encoding #=> #<Encoding:UTF-8>
1206  * r =~ "\u{6666} a" #=> 0
1207  * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError
1208  * r =~ "abc".force_encoding("euc-jp") #=> nil
1209  */
1210 
1211 static VALUE
1213 {
1214  if (FL_TEST(re, KCODE_FIXED))
1215  return Qtrue;
1216  else
1217  return Qfalse;
1218 }
1219 
1220 static VALUE
1221 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
1222  rb_encoding **fixed_enc, onig_errmsg_buffer err);
1223 
1224 
1225 static void
1227 {
1229  "incompatible encoding regexp match (%s regexp with %s string)",
1230  rb_enc_name(rb_enc_get(re)),
1231  rb_enc_name(rb_enc_get(str)));
1232 }
1233 
1234 static rb_encoding*
1235 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
1236 {
1237  rb_encoding *enc = 0;
1238 
1241  "invalid byte sequence in %s",
1242  rb_enc_name(rb_enc_get(str)));
1243  }
1244 
1245  rb_reg_check(re);
1246  enc = rb_enc_get(str);
1247  if (!rb_enc_str_asciicompat_p(str)) {
1248  if (RREGEXP(re)->ptr->enc != enc) {
1249  reg_enc_error(re, str);
1250  }
1251  }
1252  else if (rb_reg_fixed_encoding_p(re)) {
1253  if (RREGEXP(re)->ptr->enc != enc &&
1254  (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
1256  reg_enc_error(re, str);
1257  }
1258  enc = RREGEXP(re)->ptr->enc;
1259  }
1260  if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
1261  enc != rb_ascii8bit_encoding() &&
1263  rb_warn("regexp match /.../n against to %s string",
1264  rb_enc_name(enc));
1265  }
1266  return enc;
1267 }
1268 
1269 regex_t *
1271 {
1272  regex_t *reg = RREGEXP(re)->ptr;
1273  onig_errmsg_buffer err = "";
1274  int r;
1275  OnigErrorInfo einfo;
1276  const char *pattern;
1277  VALUE unescaped;
1278  rb_encoding *fixed_enc = 0;
1279  rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
1280 
1281  if (reg->enc == enc) return reg;
1282 
1283  rb_reg_check(re);
1284  reg = RREGEXP(re)->ptr;
1285  pattern = RREGEXP_SRC_PTR(re);
1286 
1287  unescaped = rb_reg_preprocess(
1288  pattern, pattern + RREGEXP_SRC_LEN(re), enc,
1289  &fixed_enc, err);
1290 
1291  if (unescaped == Qnil) {
1292  rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
1293  }
1294 
1295  r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
1296  (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
1297  reg->options, enc,
1298  OnigDefaultSyntax, &einfo);
1299  if (r) {
1300  onig_error_code_to_str((UChar*)err, r, &einfo);
1301  rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
1302  }
1303 
1304  RB_GC_GUARD(unescaped);
1305  return reg;
1306 }
1307 
1308 long
1309 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
1310 {
1311  long range;
1312  rb_encoding *enc;
1313  UChar *p, *string;
1314 
1315  enc = rb_reg_prepare_enc(re, str, 0);
1316 
1317  if (reverse) {
1318  range = -pos;
1319  }
1320  else {
1321  range = RSTRING_LEN(str) - pos;
1322  }
1323 
1324  if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
1325  string = (UChar*)RSTRING_PTR(str);
1326 
1327  if (range > 0) {
1328  p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
1329  }
1330  else {
1331  p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
1332  }
1333  return p - string;
1334  }
1335 
1336  return pos;
1337 }
1338 
1339 long
1340 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
1341 {
1342  long result;
1343  VALUE match;
1344  struct re_registers regi, *regs = &regi;
1345  char *range = RSTRING_PTR(str);
1346  regex_t *reg;
1347  int tmpreg;
1348 
1349  if (pos > RSTRING_LEN(str) || pos < 0) {
1351  return -1;
1352  }
1353 
1354  reg = rb_reg_prepare_re(re, str);
1355  tmpreg = reg != RREGEXP(re)->ptr;
1356  if (!tmpreg) RREGEXP(re)->usecnt++;
1357 
1358  match = rb_backref_get();
1359  if (!NIL_P(match)) {
1360  if (FL_TEST(match, MATCH_BUSY)) {
1361  match = Qnil;
1362  }
1363  else {
1364  regs = RMATCH_REGS(match);
1365  }
1366  }
1367  if (NIL_P(match)) {
1368  MEMZERO(regs, struct re_registers, 1);
1369  }
1370  if (!reverse) {
1371  range += RSTRING_LEN(str);
1372  }
1373  result = onig_search(reg,
1374  (UChar*)(RSTRING_PTR(str)),
1375  ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
1376  ((UChar*)(RSTRING_PTR(str)) + pos),
1377  ((UChar*)range),
1378  regs, ONIG_OPTION_NONE);
1379  if (!tmpreg) RREGEXP(re)->usecnt--;
1380  if (tmpreg) {
1381  if (RREGEXP(re)->usecnt) {
1382  onig_free(reg);
1383  }
1384  else {
1385  onig_free(RREGEXP(re)->ptr);
1386  RREGEXP(re)->ptr = reg;
1387  }
1388  }
1389  if (result < 0) {
1390  if (regs == &regi)
1391  onig_region_free(regs, 0);
1392  if (result == ONIG_MISMATCH) {
1394  return result;
1395  }
1396  else {
1397  onig_errmsg_buffer err = "";
1398  onig_error_code_to_str((UChar*)err, (int)result);
1399  rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
1400  }
1401  }
1402 
1403  if (NIL_P(match)) {
1404  match = match_alloc(rb_cMatch);
1405  onig_region_copy(RMATCH_REGS(match), regs);
1406  onig_region_free(regs, 0);
1407  }
1408  else {
1409  if (rb_safe_level() >= 3)
1410  OBJ_TAINT(match);
1411  else
1412  FL_UNSET(match, FL_TAINT);
1413  }
1414 
1415  RMATCH(match)->str = rb_str_new4(str);
1416  RMATCH(match)->regexp = re;
1417  RMATCH(match)->rmatch->char_offset_updated = 0;
1418  rb_backref_set(match);
1419 
1420  OBJ_INFECT(match, re);
1421  OBJ_INFECT(match, str);
1422 
1423  return result;
1424 }
1425 
1426 VALUE
1428 {
1429  struct re_registers *regs;
1430  if (NIL_P(match)) return Qnil;
1431  match_check(match);
1432  regs = RMATCH_REGS(match);
1433  if (nth >= regs->num_regs) {
1434  return Qnil;
1435  }
1436  if (nth < 0) {
1437  nth += regs->num_regs;
1438  if (nth <= 0) return Qnil;
1439  }
1440  if (BEG(nth) == -1) return Qfalse;
1441  return Qtrue;
1442 }
1443 
1444 VALUE
1446 {
1447  VALUE str;
1448  long start, end, len;
1449  struct re_registers *regs;
1450 
1451  if (NIL_P(match)) return Qnil;
1452  match_check(match);
1453  regs = RMATCH_REGS(match);
1454  if (nth >= regs->num_regs) {
1455  return Qnil;
1456  }
1457  if (nth < 0) {
1458  nth += regs->num_regs;
1459  if (nth <= 0) return Qnil;
1460  }
1461  start = BEG(nth);
1462  if (start == -1) return Qnil;
1463  end = END(nth);
1464  len = end - start;
1465  str = rb_str_subseq(RMATCH(match)->str, start, len);
1466  OBJ_INFECT(str, match);
1467  return str;
1468 }
1469 
1470 VALUE
1472 {
1473  return rb_reg_nth_match(0, match);
1474 }
1475 
1476 
1477 /*
1478  * call-seq:
1479  * mtch.pre_match -> str
1480  *
1481  * Returns the portion of the original string before the current match.
1482  * Equivalent to the special variable <code>$`</code>.
1483  *
1484  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1485  * m.pre_match #=> "T"
1486  */
1487 
1488 VALUE
1490 {
1491  VALUE str;
1492  struct re_registers *regs;
1493 
1494  if (NIL_P(match)) return Qnil;
1495  match_check(match);
1496  regs = RMATCH_REGS(match);
1497  if (BEG(0) == -1) return Qnil;
1498  str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
1499  if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1500  return str;
1501 }
1502 
1503 
1504 /*
1505  * call-seq:
1506  * mtch.post_match -> str
1507  *
1508  * Returns the portion of the original string after the current match.
1509  * Equivalent to the special variable <code>$'</code>.
1510  *
1511  * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1512  * m.post_match #=> ": The Movie"
1513  */
1514 
1515 VALUE
1517 {
1518  VALUE str;
1519  long pos;
1520  struct re_registers *regs;
1521 
1522  if (NIL_P(match)) return Qnil;
1523  match_check(match);
1524  regs = RMATCH_REGS(match);
1525  if (BEG(0) == -1) return Qnil;
1526  str = RMATCH(match)->str;
1527  pos = END(0);
1528  str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
1529  if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1530  return str;
1531 }
1532 
1533 VALUE
1535 {
1536  int i;
1537  struct re_registers *regs;
1538 
1539  if (NIL_P(match)) return Qnil;
1540  match_check(match);
1541  regs = RMATCH_REGS(match);
1542  if (BEG(0) == -1) return Qnil;
1543 
1544  for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
1545  ;
1546  if (i == 0) return Qnil;
1547  return rb_reg_nth_match(i, match);
1548 }
1549 
1550 static VALUE
1552 {
1554 }
1555 
1556 static VALUE
1558 {
1559  return rb_reg_match_pre(rb_backref_get());
1560 }
1561 
1562 static VALUE
1564 {
1566 }
1567 
1568 static VALUE
1570 {
1572 }
1573 
1574 static VALUE
1576 {
1577  struct re_registers *regs;
1578  VALUE ary;
1579  VALUE target;
1580  int i;
1581  int taint = OBJ_TAINTED(match);
1582 
1583  match_check(match);
1584  regs = RMATCH_REGS(match);
1585  ary = rb_ary_new2(regs->num_regs);
1586  target = RMATCH(match)->str;
1587 
1588  for (i=start; i<regs->num_regs; i++) {
1589  if (regs->beg[i] == -1) {
1590  rb_ary_push(ary, Qnil);
1591  }
1592  else {
1593  VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1594  if (taint) OBJ_TAINT(str);
1595  rb_ary_push(ary, str);
1596  }
1597  }
1598  return ary;
1599 }
1600 
1601 
1602 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
1603  second example to prevent the '*' followed by a '/' from ending the
1604  comment. */
1605 
1606 /*
1607  * call-seq:
1608  * mtch.to_a -> anArray
1609  *
1610  * Returns the array of matches.
1611  *
1612  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1613  * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1614  *
1615  * Because <code>to_a</code> is called when expanding
1616  * <code>*</code><em>variable</em>, there's a useful assignment
1617  * shortcut for extracting matched fields. This is slightly slower than
1618  * accessing the fields directly (as an intermediate array is
1619  * generated).
1620  *
1621  * all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
1622  * all #=> "HX1138"
1623  * f1 #=> "H"
1624  * f2 #=> "X"
1625  * f3 #=> "113"
1626  */
1627 
1628 static VALUE
1630 {
1631  return match_array(match, 0);
1632 }
1633 
1634 
1635 /*
1636  * call-seq:
1637  * mtch.captures -> array
1638  *
1639  * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1640  *
1641  * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1642  * f1 #=> "H"
1643  * f2 #=> "X"
1644  * f3 #=> "113"
1645  * f4 #=> "8"
1646  */
1647 static VALUE
1649 {
1650  return match_array(match, 1);
1651 }
1652 
1653 static int
1654 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
1655 {
1656  int num;
1657 
1658  num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
1659  (const unsigned char* )name, (const unsigned char* )name_end, regs);
1660  if (num >= 1) {
1661  return num;
1662  }
1663  else {
1664  VALUE s = rb_str_new(name, (long )(name_end - name));
1665  rb_raise(rb_eIndexError, "undefined group name reference: %s",
1666  StringValuePtr(s));
1667  }
1668 }
1669 
1670 /*
1671  * call-seq:
1672  * mtch[i] -> str or nil
1673  * mtch[start, length] -> array
1674  * mtch[range] -> array
1675  * mtch[name] -> str or nil
1676  *
1677  * Match Reference---<code>MatchData</code> acts as an array, and may be
1678  * accessed using the normal array indexing techniques. <i>mtch</i>[0] is
1679  * equivalent to the special variable <code>$&</code>, and returns the entire
1680  * matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
1681  * of the matched backreferences (portions of the pattern between parentheses).
1682  *
1683  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1684  * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
1685  * m[0] #=> "HX1138"
1686  * m[1, 2] #=> ["H", "X"]
1687  * m[1..3] #=> ["H", "X", "113"]
1688  * m[-3, 2] #=> ["X", "113"]
1689  *
1690  * m = /(?<foo>a+)b/.match("ccaaab")
1691  * m #=> #<MatchData "aaab" foo:"aaa">
1692  * m["foo"] #=> "aaa"
1693  * m[:foo] #=> "aaa"
1694  */
1695 
1696 static VALUE
1698 {
1699  VALUE idx, rest;
1700 
1701  match_check(match);
1702  rb_scan_args(argc, argv, "11", &idx, &rest);
1703 
1704  if (NIL_P(rest)) {
1705  if (FIXNUM_P(idx)) {
1706  if (FIX2INT(idx) >= 0) {
1707  return rb_reg_nth_match(FIX2INT(idx), match);
1708  }
1709  }
1710  else {
1711  const char *p;
1712  int num;
1713 
1714  switch (TYPE(idx)) {
1715  case T_SYMBOL:
1716  p = rb_id2name(SYM2ID(idx));
1717  goto name_to_backref;
1718  break;
1719  case T_STRING:
1720  p = StringValuePtr(idx);
1721 
1722  name_to_backref:
1723  num = name_to_backref_number(RMATCH_REGS(match),
1724  RMATCH(match)->regexp, p, p + strlen(p));
1725  return rb_reg_nth_match(num, match);
1726  break;
1727 
1728  default:
1729  break;
1730  }
1731  }
1732  }
1733 
1734  return rb_ary_aref(argc, argv, match_to_a(match));
1735 }
1736 
1737 static VALUE
1739 {
1740  /* n should not exceed num_regs */
1741  return rb_reg_nth_match((int)n, match);
1742 }
1743 
1744 
1745 /*
1746  * call-seq:
1747  *
1748  * mtch.values_at([index]*) -> array
1749  *
1750  * Uses each <i>index</i> to access the matching values, returning an array of
1751  * the corresponding matches.
1752  *
1753  * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1754  * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1755  * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"]
1756  */
1757 
1758 static VALUE
1760 {
1761  struct re_registers *regs;
1762 
1763  match_check(match);
1764  regs = RMATCH_REGS(match);
1765  return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
1766 }
1767 
1768 
1769 /*
1770  * call-seq:
1771  * mtch.to_s -> str
1772  *
1773  * Returns the entire matched string.
1774  *
1775  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1776  * m.to_s #=> "HX1138"
1777  */
1778 
1779 static VALUE
1781 {
1782  VALUE str = rb_reg_last_match(match);
1783 
1784  match_check(match);
1785  if (NIL_P(str)) str = rb_str_new(0,0);
1786  if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1787  if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
1788  return str;
1789 }
1790 
1791 
1792 /*
1793  * call-seq:
1794  * mtch.string -> str
1795  *
1796  * Returns a frozen copy of the string passed in to <code>match</code>.
1797  *
1798  * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1799  * m.string #=> "THX1138."
1800  */
1801 
1802 static VALUE
1804 {
1805  match_check(match);
1806  return RMATCH(match)->str; /* str is frozen */
1807 }
1808 
1810  const UChar *name;
1811  long len;
1812 };
1813 
1814 static int
1816  int back_num, int *back_refs, OnigRegex regex, void *arg0)
1817 {
1818  struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
1819  int i;
1820 
1821  for (i = 0; i < back_num; i++) {
1822  arg[back_refs[i]].name = name;
1823  arg[back_refs[i]].len = name_end - name;
1824  }
1825  return 0;
1826 }
1827 
1828 /*
1829  * call-seq:
1830  * mtch.inspect -> str
1831  *
1832  * Returns a printable version of <i>mtch</i>.
1833  *
1834  * puts /.$/.match("foo").inspect
1835  * #=> #<MatchData "o">
1836  *
1837  * puts /(.)(.)(.)/.match("foo").inspect
1838  * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
1839  *
1840  * puts /(.)(.)?(.)/.match("fo").inspect
1841  * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
1842  *
1843  * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
1844  * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
1845  *
1846  */
1847 
1848 static VALUE
1850 {
1851  const char *cname = rb_obj_classname(match);
1852  VALUE str;
1853  int i;
1854  struct re_registers *regs = RMATCH_REGS(match);
1855  int num_regs = regs->num_regs;
1856  struct backref_name_tag *names;
1857  VALUE regexp = RMATCH(match)->regexp;
1858 
1859  if (regexp == 0) {
1860  return rb_sprintf("#<%s:%p>", cname, (void*)match);
1861  }
1862 
1863  names = ALLOCA_N(struct backref_name_tag, num_regs);
1864  MEMZERO(names, struct backref_name_tag, num_regs);
1865 
1866  onig_foreach_name(RREGEXP(regexp)->ptr,
1867  match_inspect_name_iter, names);
1868 
1869  str = rb_str_buf_new2("#<");
1870  rb_str_buf_cat2(str, cname);
1871 
1872  for (i = 0; i < num_regs; i++) {
1873  VALUE v;
1874  rb_str_buf_cat2(str, " ");
1875  if (0 < i) {
1876  if (names[i].name)
1877  rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
1878  else {
1879  rb_str_catf(str, "%d", i);
1880  }
1881  rb_str_buf_cat2(str, ":");
1882  }
1883  v = rb_reg_nth_match(i, match);
1884  if (v == Qnil)
1885  rb_str_buf_cat2(str, "nil");
1886  else
1888  }
1889  rb_str_buf_cat2(str, ">");
1890 
1891  return str;
1892 }
1893 
1895 
1896 static int
1897 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
1898 {
1899  const char *p = *pp;
1900  int code;
1901  int meta_prefix = 0, ctrl_prefix = 0;
1902  size_t len;
1903 
1904  if (p == end || *p++ != '\\') {
1905  errcpy(err, "too short escaped multibyte character");
1906  return -1;
1907  }
1908 
1909 again:
1910  if (p == end) {
1911  errcpy(err, "too short escape sequence");
1912  return -1;
1913  }
1914  switch (*p++) {
1915  case '\\': code = '\\'; break;
1916  case 'n': code = '\n'; break;
1917  case 't': code = '\t'; break;
1918  case 'r': code = '\r'; break;
1919  case 'f': code = '\f'; break;
1920  case 'v': code = '\013'; break;
1921  case 'a': code = '\007'; break;
1922  case 'e': code = '\033'; break;
1923 
1924  /* \OOO */
1925  case '0': case '1': case '2': case '3':
1926  case '4': case '5': case '6': case '7':
1927  p--;
1928  code = scan_oct(p, end < p+3 ? end-p : 3, &len);
1929  p += len;
1930  break;
1931 
1932  case 'x': /* \xHH */
1933  code = scan_hex(p, end < p+2 ? end-p : 2, &len);
1934  if (len < 1) {
1935  errcpy(err, "invalid hex escape");
1936  return -1;
1937  }
1938  p += len;
1939  break;
1940 
1941  case 'M': /* \M-X, \M-\C-X, \M-\cX */
1942  if (meta_prefix) {
1943  errcpy(err, "duplicate meta escape");
1944  return -1;
1945  }
1946  meta_prefix = 1;
1947  if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
1948  if (*p == '\\') {
1949  p++;
1950  goto again;
1951  }
1952  else {
1953  code = *p++;
1954  break;
1955  }
1956  }
1957  errcpy(err, "too short meta escape");
1958  return -1;
1959 
1960  case 'C': /* \C-X, \C-\M-X */
1961  if (p == end || *p++ != '-') {
1962  errcpy(err, "too short control escape");
1963  return -1;
1964  }
1965  case 'c': /* \cX, \c\M-X */
1966  if (ctrl_prefix) {
1967  errcpy(err, "duplicate control escape");
1968  return -1;
1969  }
1970  ctrl_prefix = 1;
1971  if (p < end && (*p & 0x80) == 0) {
1972  if (*p == '\\') {
1973  p++;
1974  goto again;
1975  }
1976  else {
1977  code = *p++;
1978  break;
1979  }
1980  }
1981  errcpy(err, "too short control escape");
1982  return -1;
1983 
1984  default:
1985  errcpy(err, "unexpected escape sequence");
1986  return -1;
1987  }
1988  if (code < 0 || 0xff < code) {
1989  errcpy(err, "invalid escape code");
1990  return -1;
1991  }
1992 
1993  if (ctrl_prefix)
1994  code &= 0x1f;
1995  if (meta_prefix)
1996  code |= 0x80;
1997 
1998  *pp = p;
1999  return code;
2000 }
2001 
2002 static int
2003 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
2005 {
2006  const char *p = *pp;
2007  int chmaxlen = rb_enc_mbmaxlen(enc);
2008  char *chbuf = ALLOCA_N(char, chmaxlen);
2009  int chlen = 0;
2010  int byte;
2011  int l;
2012 
2013  memset(chbuf, 0, chmaxlen);
2014 
2015  byte = read_escaped_byte(&p, end, err);
2016  if (byte == -1) {
2017  return -1;
2018  }
2019 
2020  chbuf[chlen++] = byte;
2021  while (chlen < chmaxlen &&
2022  MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
2023  byte = read_escaped_byte(&p, end, err);
2024  if (byte == -1) {
2025  return -1;
2026  }
2027  chbuf[chlen++] = byte;
2028  }
2029 
2030  l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
2031  if (MBCLEN_INVALID_P(l)) {
2032  errcpy(err, "invalid multibyte escape");
2033  return -1;
2034  }
2035  if (1 < chlen || (chbuf[0] & 0x80)) {
2036  rb_str_buf_cat(buf, chbuf, chlen);
2037 
2038  if (*encp == 0)
2039  *encp = enc;
2040  else if (*encp != enc) {
2041  errcpy(err, "escaped non ASCII character in UTF-8 regexp");
2042  return -1;
2043  }
2044  }
2045  else {
2046  char escbuf[5];
2047  snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
2048  rb_str_buf_cat(buf, escbuf, 4);
2049  }
2050  *pp = p;
2051  return 0;
2052 }
2053 
2054 static int
2056 {
2057  if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
2058  0x10ffff < code) {
2059  errcpy(err, "invalid Unicode range");
2060  return -1;
2061  }
2062  return 0;
2063 }
2064 
2065 static int
2066 append_utf8(unsigned long uv,
2068 {
2069  if (check_unicode_range(uv, err) != 0)
2070  return -1;
2071  if (uv < 0x80) {
2072  char escbuf[5];
2073  snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
2074  rb_str_buf_cat(buf, escbuf, 4);
2075  }
2076  else {
2077  int len;
2078  char utf8buf[6];
2079  len = rb_uv_to_utf8(utf8buf, uv);
2080  rb_str_buf_cat(buf, utf8buf, len);
2081 
2082  if (*encp == 0)
2083  *encp = rb_utf8_encoding();
2084  else if (*encp != rb_utf8_encoding()) {
2085  errcpy(err, "UTF-8 character in non UTF-8 regexp");
2086  return -1;
2087  }
2088  }
2089  return 0;
2090 }
2091 
2092 static int
2093 unescape_unicode_list(const char **pp, const char *end,
2095 {
2096  const char *p = *pp;
2097  int has_unicode = 0;
2098  unsigned long code;
2099  size_t len;
2100 
2101  while (p < end && ISSPACE(*p)) p++;
2102 
2103  while (1) {
2104  code = ruby_scan_hex(p, end-p, &len);
2105  if (len == 0)
2106  break;
2107  if (6 < len) { /* max 10FFFF */
2108  errcpy(err, "invalid Unicode range");
2109  return -1;
2110  }
2111  p += len;
2112  if (append_utf8(code, buf, encp, err) != 0)
2113  return -1;
2114  has_unicode = 1;
2115 
2116  while (p < end && ISSPACE(*p)) p++;
2117  }
2118 
2119  if (has_unicode == 0) {
2120  errcpy(err, "invalid Unicode list");
2121  return -1;
2122  }
2123 
2124  *pp = p;
2125 
2126  return 0;
2127 }
2128 
2129 static int
2130 unescape_unicode_bmp(const char **pp, const char *end,
2132 {
2133  const char *p = *pp;
2134  size_t len;
2135  unsigned long code;
2136 
2137  if (end < p+4) {
2138  errcpy(err, "invalid Unicode escape");
2139  return -1;
2140  }
2141  code = ruby_scan_hex(p, 4, &len);
2142  if (len != 4) {
2143  errcpy(err, "invalid Unicode escape");
2144  return -1;
2145  }
2146  if (append_utf8(code, buf, encp, err) != 0)
2147  return -1;
2148  *pp = p + 4;
2149  return 0;
2150 }
2151 
2152 static int
2153 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
2154  VALUE buf, rb_encoding **encp, int *has_property,
2156 {
2157  char c;
2158  char smallbuf[2];
2159 
2160  while (p < end) {
2161  int chlen = rb_enc_precise_mbclen(p, end, enc);
2162  if (!MBCLEN_CHARFOUND_P(chlen)) {
2163  errcpy(err, "invalid multibyte character");
2164  return -1;
2165  }
2166  chlen = MBCLEN_CHARFOUND_LEN(chlen);
2167  if (1 < chlen || (*p & 0x80)) {
2168  rb_str_buf_cat(buf, p, chlen);
2169  p += chlen;
2170  if (*encp == 0)
2171  *encp = enc;
2172  else if (*encp != enc) {
2173  errcpy(err, "non ASCII character in UTF-8 regexp");
2174  return -1;
2175  }
2176  continue;
2177  }
2178 
2179  switch (c = *p++) {
2180  case '\\':
2181  if (p == end) {
2182  errcpy(err, "too short escape sequence");
2183  return -1;
2184  }
2185  switch (c = *p++) {
2186  case '1': case '2': case '3':
2187  case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2188  {
2189  size_t octlen;
2190  if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
2191  /* backref or 7bit octal.
2192  no need to unescape anyway.
2193  re-escaping may break backref */
2194  goto escape_asis;
2195  }
2196  }
2197  /* xxx: How about more than 199 subexpressions? */
2198 
2199  case '0': /* \0, \0O, \0OO */
2200 
2201  case 'x': /* \xHH */
2202  case 'c': /* \cX, \c\M-X */
2203  case 'C': /* \C-X, \C-\M-X */
2204  case 'M': /* \M-X, \M-\C-X, \M-\cX */
2205  p = p-2;
2206  if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
2207  return -1;
2208  break;
2209 
2210  case 'u':
2211  if (p == end) {
2212  errcpy(err, "too short escape sequence");
2213  return -1;
2214  }
2215  if (*p == '{') {
2216  /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2217  p++;
2218  if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
2219  return -1;
2220  if (p == end || *p++ != '}') {
2221  errcpy(err, "invalid Unicode list");
2222  return -1;
2223  }
2224  break;
2225  }
2226  else {
2227  /* \uHHHH */
2228  if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
2229  return -1;
2230  break;
2231  }
2232 
2233  case 'p': /* \p{Hiragana} */
2234  case 'P':
2235  if (!*encp) {
2236  *has_property = 1;
2237  }
2238  goto escape_asis;
2239 
2240  default: /* \n, \\, \d, \9, etc. */
2241 escape_asis:
2242  smallbuf[0] = '\\';
2243  smallbuf[1] = c;
2244  rb_str_buf_cat(buf, smallbuf, 2);
2245  break;
2246  }
2247  break;
2248 
2249  default:
2250  rb_str_buf_cat(buf, &c, 1);
2251  break;
2252  }
2253  }
2254 
2255  return 0;
2256 }
2257 
2258 static VALUE
2259 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
2260  rb_encoding **fixed_enc, onig_errmsg_buffer err)
2261 {
2262  VALUE buf;
2263  int has_property = 0;
2264 
2265  buf = rb_str_buf_new(0);
2266 
2267  if (rb_enc_asciicompat(enc))
2268  *fixed_enc = 0;
2269  else {
2270  *fixed_enc = enc;
2271  rb_enc_associate(buf, enc);
2272  }
2273 
2274  if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
2275  return Qnil;
2276 
2277  if (has_property && !*fixed_enc) {
2278  *fixed_enc = enc;
2279  }
2280 
2281  if (*fixed_enc) {
2282  rb_enc_associate(buf, *fixed_enc);
2283  }
2284 
2285  return buf;
2286 }
2287 
2288 VALUE
2290 {
2291  rb_encoding *fixed_enc = 0;
2292  onig_errmsg_buffer err = "";
2293  VALUE buf;
2294  char *p, *end;
2295  rb_encoding *enc;
2296 
2297  StringValue(str);
2298  p = RSTRING_PTR(str);
2299  end = p + RSTRING_LEN(str);
2300  enc = rb_enc_get(str);
2301 
2302  buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
2303  RB_GC_GUARD(str);
2304 
2305  if (buf == Qnil) {
2306  return rb_reg_error_desc(str, 0, err);
2307  }
2308  return Qnil;
2309 }
2310 
2311 static VALUE
2313 {
2314  rb_encoding *fixed_enc = 0;
2315  rb_encoding *regexp_enc = 0;
2316  onig_errmsg_buffer err = "";
2317  int i;
2318  VALUE result = 0;
2319  rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2320 
2321  if (RARRAY_LEN(ary) == 0) {
2322  rb_raise(rb_eArgError, "no arguments given");
2323  }
2324 
2325  for (i = 0; i < RARRAY_LEN(ary); i++) {
2326  VALUE str = RARRAY_PTR(ary)[i];
2327  VALUE buf;
2328  char *p, *end;
2329  rb_encoding *src_enc;
2330 
2331  src_enc = rb_enc_get(str);
2332  if (options & ARG_ENCODING_NONE &&
2333  src_enc != ascii8bit) {
2335  rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2336  else
2337  src_enc = ascii8bit;
2338  }
2339 
2340  StringValue(str);
2341  p = RSTRING_PTR(str);
2342  end = p + RSTRING_LEN(str);
2343 
2344  buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
2345 
2346  if (buf == Qnil)
2347  rb_raise(rb_eArgError, "%s", err);
2348 
2349  if (fixed_enc != 0) {
2350  if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2351  rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
2352  rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
2353  }
2354  regexp_enc = fixed_enc;
2355  }
2356 
2357  if (!result)
2358  result = rb_str_new3(str);
2359  else
2360  rb_str_buf_append(result, str);
2361  }
2362  if (regexp_enc) {
2363  rb_enc_associate(result, regexp_enc);
2364  }
2365 
2366  return result;
2367 }
2368 
2369 static int
2370 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
2372  const char *sourcefile, int sourceline)
2373 {
2374  struct RRegexp *re = RREGEXP(obj);
2375  VALUE unescaped;
2376  rb_encoding *fixed_enc = 0;
2378 
2379  if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
2380  rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
2381  rb_check_frozen(obj);
2382  if (FL_TEST(obj, REG_LITERAL))
2383  rb_raise(rb_eSecurityError, "can't modify literal regexp");
2384  if (re->ptr)
2385  rb_raise(rb_eTypeError, "already initialized regexp");
2386  re->ptr = 0;
2387 
2388  if (rb_enc_dummy_p(enc)) {
2389  errcpy(err, "can't make regexp with dummy encoding");
2390  return -1;
2391  }
2392 
2393  unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
2394  if (unescaped == Qnil)
2395  return -1;
2396 
2397  if (fixed_enc) {
2398  if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
2399  (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
2400  errcpy(err, "incompatible character encoding");
2401  return -1;
2402  }
2403  if (fixed_enc != a_enc) {
2404  options |= ARG_ENCODING_FIXED;
2405  enc = fixed_enc;
2406  }
2407  }
2408  else if (!(options & ARG_ENCODING_FIXED)) {
2409  enc = rb_usascii_encoding();
2410  }
2411 
2412  rb_enc_associate((VALUE)re, enc);
2413  if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2414  re->basic.flags |= KCODE_FIXED;
2415  }
2416  if (options & ARG_ENCODING_NONE) {
2417  re->basic.flags |= REG_ENCODING_NONE;
2418  }
2419 
2420  re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
2421  options & ARG_REG_OPTION_MASK, err,
2422  sourcefile, sourceline);
2423  if (!re->ptr) return -1;
2424  re->src = rb_enc_str_new(s, len, enc);
2425  OBJ_FREEZE(re->src);
2426  RB_GC_GUARD(unescaped);
2427  return 0;
2428 }
2429 
2430 static int
2432  const char *sourcefile, int sourceline)
2433 {
2434  int ret;
2435  rb_encoding *enc = rb_enc_get(str);
2436  if (options & ARG_ENCODING_NONE) {
2437  rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2438  if (enc != ascii8bit) {
2440  errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2441  return -1;
2442  }
2443  enc = ascii8bit;
2444  }
2445  }
2446  ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
2447  options, err, sourcefile, sourceline);
2448  OBJ_INFECT(obj, str);
2449  RB_GC_GUARD(str);
2450  return ret;
2451 }
2452 
2453 static VALUE
2455 {
2456  NEWOBJ(re, struct RRegexp);
2457  OBJSETUP(re, klass, T_REGEXP);
2458 
2459  re->ptr = 0;
2460  re->src = 0;
2461  re->usecnt = 0;
2462 
2463  return (VALUE)re;
2464 }
2465 
2466 VALUE
2468 {
2469  return rb_reg_s_alloc(rb_cRegexp);
2470 }
2471 
2472 VALUE
2474 {
2475  return rb_reg_init_str(rb_reg_alloc(), s, options);
2476 }
2477 
2478 VALUE
2480 {
2481  onig_errmsg_buffer err = "";
2482 
2483  if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
2484  rb_reg_raise_str(s, options, err);
2485  }
2486 
2487  return re;
2488 }
2489 
2490 VALUE
2491 rb_reg_new_ary(VALUE ary, int opt)
2492 {
2493  return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
2494 }
2495 
2496 VALUE
2497 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
2498 {
2499  VALUE re = rb_reg_alloc();
2500  onig_errmsg_buffer err = "";
2501 
2502  if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
2503  rb_enc_reg_raise(s, len, enc, options, err);
2504  }
2505 
2506  return re;
2507 }
2508 
2509 VALUE
2510 rb_reg_new(const char *s, long len, int options)
2511 {
2512  return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
2513 }
2514 
2515 VALUE
2516 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
2517 {
2518  VALUE re = rb_reg_alloc();
2519  onig_errmsg_buffer err = "";
2520 
2521  if (!str) str = rb_str_new(0,0);
2522  if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
2523  rb_set_errinfo(rb_reg_error_desc(str, options, err));
2524  return Qnil;
2525  }
2526  FL_SET(re, REG_LITERAL);
2527  return re;
2528 }
2529 
2531 
2532 VALUE
2534 {
2535  volatile VALUE save_str = str;
2537  && ENCODING_GET(reg_cache) == ENCODING_GET(str)
2538  && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
2539  return reg_cache;
2540 
2541  return reg_cache = rb_reg_new_str(save_str, 0);
2542 }
2543 
2544 static st_index_t reg_hash(VALUE re);
2545 /*
2546  * call-seq:
2547  * rxp.hash -> fixnum
2548  *
2549  * Produce a hash based on the text and options of this regular expression.
2550  */
2551 
2552 static VALUE
2554 {
2555  st_index_t hashval = reg_hash(re);
2556  return LONG2FIX(hashval);
2557 }
2558 
2559 static st_index_t
2561 {
2562  st_index_t hashval;
2563 
2564  rb_reg_check(re);
2565  hashval = RREGEXP(re)->ptr->options;
2566  hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
2567  return rb_hash_end(hashval);
2568 }
2569 
2570 
2571 /*
2572  * call-seq:
2573  * rxp == other_rxp -> true or false
2574  * rxp.eql?(other_rxp) -> true or false
2575  *
2576  * Equality---Two regexps are equal if their patterns are identical, they have
2577  * the same character set code, and their <code>casefold?</code> values are the
2578  * same.
2579  *
2580  * /abc/ == /abc/x #=> false
2581  * /abc/ == /abc/i #=> false
2582  * /abc/ == /abc/n #=> false
2583  * /abc/u == /abc/n #=> false
2584  */
2585 
/*
 * Regexp equality. Returns Qtrue when re1 and re2 are the same object, or
 * when re2 is a T_REGEXP whose KCODE_FIXED flag, compiled options, source
 * length, encoding index and source bytes all match those of re1; returns
 * Qfalse otherwise. Both operands go through rb_reg_check before their
 * fields are read.
 *
 * NOTE(review): the function header (listing line 2587) is not visible in
 * this listing.
 */
2586 static VALUE
2588 {
2589  if (re1 == re2) return Qtrue;
2590  if (TYPE(re2) != T_REGEXP) return Qfalse;
2591  rb_reg_check(re1); rb_reg_check(re2);
2592  if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
2593  if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
2594  if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
2595  if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
 /* lengths are already known to be equal, so comparing RREGEXP_SRC_LEN(re1) bytes covers both sources */
2596  if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
2597  return Qtrue;
2598  }
2599  return Qfalse;
2600 }
2601 
2602 /*
2603  * call-seq:
2604  * mtch.hash -> integer
2605  *
2606  * Produce a hash based on the target string, regexp and matched
2607  * positions of this matchdata.
2608  */
2609 
2610 static VALUE
2612 {
2613  const struct re_registers *regs;
2614  st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
2615 
2616  rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
2617  regs = RMATCH_REGS(match);
2618  hashval = rb_hash_uint(hashval, regs->num_regs);
2619  hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
2620  hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
2621  hashval = rb_hash_end(hashval);
2622  return LONG2FIX(hashval);
2623 }
2624 
2625 /*
2626  * call-seq:
2627  * mtch == mtch2 -> true or false
2628  *
2629  * Equality---Two matchdata are equal if their target strings,
2630  * patterns, and matched positions are identical.
2631  */
2632 
2633 static VALUE
2634 match_equal(VALUE match1, VALUE match2)
2635 {
2636  const struct re_registers *regs1, *regs2;
2637  if (match1 == match2) return Qtrue;
2638  if (TYPE(match2) != T_MATCH) return Qfalse;
2639  if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
2640  if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
2641  regs1 = RMATCH_REGS(match1);
2642  regs2 = RMATCH_REGS(match2);
2643  if (regs1->num_regs != regs2->num_regs) return Qfalse;
2644  if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
2645  if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
2646  return Qtrue;
2647 }
2648 
2649 static VALUE
2650 reg_operand(VALUE s, int check)
2651 {
2652  if (SYMBOL_P(s)) {
2653  return rb_sym_to_s(s);
2654  }
2655  else {
2656  VALUE tmp = rb_check_string_type(s);
2657  if (check && NIL_P(tmp)) {
2658  rb_raise(rb_eTypeError, "can't convert %s to String",
2659  rb_obj_classname(s));
2660  }
2661  return tmp;
2662  }
2663 }
2664 
/*
 * Shared search helper: matches `re` against *strp starting at `pos` and
 * returns the value of rb_reg_search, or a negative number on failure.
 *
 * re   - the regexp to search with.
 * strp - in/out: the operand; replaced by its String conversion
 *        (reg_operand with check enabled, so a non-convertible operand
 *        raises TypeError). A nil operand yields -1 without conversion.
 * pos  - start position; a negative value is taken relative to the string
 *        length, and a value that is still negative afterwards is returned
 *        unchanged. Non-zero positions are passed through rb_str_offset
 *        before searching (presumably a character-index to byte-offset
 *        conversion -- confirm against string.c).
 *
 * NOTE(review): listing line 2671 (inside the nil branch) is not visible in
 * this listing.
 */
2665 static long
2666 reg_match_pos(VALUE re, VALUE *strp, long pos)
2667 {
2668  VALUE str = *strp;
2669 
2670  if (NIL_P(str)) {
2672  return -1;
2673  }
2674  *strp = str = reg_operand(str, TRUE);
2675  if (pos != 0) {
2676  if (pos < 0) {
2677  VALUE l = rb_str_length(str);
2678  pos += NUM2INT(l);
2679  if (pos < 0) {
2680  return pos;
2681  }
2682  }
2683  pos = rb_str_offset(str, pos);
2684  }
2685  return rb_reg_search(re, str, pos, 0);
2686 }
2687 
2688 /*
2689  * call-seq:
2690  * rxp =~ str -> integer or nil
2691  *
2692  * Match---Matches <i>rxp</i> against <i>str</i>.
2693  *
2694  * /at/ =~ "input data" #=> 7
2695  * /ax/ =~ "input data" #=> nil
2696  *
2697  * If <code>=~</code> is used with a regexp literal with named captures,
2698  * captured strings (or nil) are assigned to local variables named by
2699  * the capture names.
2700  *
2701  * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y "
2702  * p lhs #=> "x"
2703  * p rhs #=> "y"
2704  *
2705  * If it is not matched, nil is assigned for the variables.
2706  *
2707  * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = "
2708  * p lhs #=> nil
2709  * p rhs #=> nil
2710  *
2711  * This assignment is implemented in the Ruby parser.
2712  * The parser detects 'regexp-literal =~ expression' for the assignment.
2713  * The regexp must be a literal without interpolation and placed at left hand side.
2714  *
2715  * The assignment does not occur if the regexp is not a literal.
2716  *
2717  * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
2718  * re =~ " x = y "
2719  * p lhs # undefined local variable
2720  * p rhs # undefined local variable
2721  *
2722  * A regexp interpolation, <code>#{}</code>, also disables
2723  * the assignment.
2724  *
2725  * rhs_pat = /(?<rhs>\w+)/
2726  * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
2727  * p lhs # undefined local variable
2728  *
2729  * The assignment does not occur if the regexp is placed at the right hand side.
2730  *
2731  * " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
2732  * p lhs, rhs # undefined local variable
2733  *
2734  */
2735 
/*
 * Body of the match operator documented above: searches from position 0 via
 * reg_match_pos, returns Qnil when the result is negative, and otherwise
 * returns the position after conversion with rb_str_sublen, as a Fixnum.
 *
 * NOTE(review): the function header (listing line 2737) is not visible in
 * this listing.
 */
2736 VALUE
2738 {
2739  long pos = reg_match_pos(re, &str, 0);
2740  if (pos < 0) return Qnil;
2741  pos = rb_str_sublen(str, pos);
2742  return LONG2FIX(pos);
2743 }
2744 
2745 /*
2746  * call-seq:
2747  * rxp === str -> true or false
2748  *
2749  * Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
2750  *
2751  * a = "HELLO"
2752  * case a
2753  * when /^[a-z]*$/; print "Lower case\n"
2754  * when /^[A-Z]*$/; print "Upper case\n"
2755  * else; print "Mixed case\n"
2756  * end
2757  *
2758  * <em>produces:</em>
2759  *
2760  * Upper case
2761  */
2762 
/*
 * Body of the case-equality operator documented above. The operand is
 * converted with reg_operand in non-raising mode, so an operand that cannot
 * be converted yields Qfalse instead of an exception. Returns Qtrue when
 * rb_reg_search finds a match starting from offset 0, Qfalse otherwise.
 *
 * NOTE(review): listing lines 2764 (function header) and 2770 (inside the
 * nil branch) are not visible in this listing.
 */
2763 VALUE
2765 {
2766  long start;
2767 
2768  str = reg_operand(str, FALSE);
2769  if (NIL_P(str)) {
2771  return Qfalse;
2772  }
2773  start = rb_reg_search(re, str, 0, 0);
2774  if (start < 0) {
2775  return Qfalse;
2776  }
2777  return Qtrue;
2778 }
2779 
2780 
2781 /*
2782  * call-seq:
2783  * ~ rxp -> integer or nil
2784  *
2785  * Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
2786  * Equivalent to <code><i>rxp</i> =~ $_</code>.
2787  *
2788  * $_ = "input data"
2789  * ~ /at/ #=> 7
2790  */
2791 
/*
 * Body of the unary match operator documented above: matches `re` against
 * the value returned by rb_lastline_get(). Returns Qnil when that value is
 * not a T_STRING or when the search fails; otherwise returns the match
 * position after conversion with rb_str_sublen, as a Fixnum.
 *
 * NOTE(review): listing lines 2793 (function header) and 2799 (inside the
 * non-string branch) are not visible in this listing.
 */
2792 VALUE
2794 {
2795  long start;
2796  VALUE line = rb_lastline_get();
2797 
2798  if (TYPE(line) != T_STRING) {
2800  return Qnil;
2801  }
2802 
2803  start = rb_reg_search(re, line, 0, 0);
2804  if (start < 0) {
2805  return Qnil;
2806  }
2807  start = rb_str_sublen(line, start);
2808  return LONG2FIX(start);
2809 }
2810 
2811 
2812 /*
2813  * call-seq:
2814  * rxp.match(str) -> matchdata or nil
2815  * rxp.match(str,pos) -> matchdata or nil
2816  *
2817  * Returns a <code>MatchData</code> object describing the match, or
2818  * <code>nil</code> if there was no match. This is equivalent to retrieving the
2819  * value of the special variable <code>$~</code> following a normal match.
2820  * If the second parameter is present, it specifies the position in the string
2821  * to begin the search.
2822  *
2823  * /(.)(.)(.)/.match("abc")[2] #=> "b"
2824  * /(.)(.)/.match("abc", 1)[2] #=> "c"
2825  *
2826  * If a block is given, the block is invoked with the MatchData if the match
2827  * succeeds, so that you can write
2828  *
2829  * pat.match(str) {|m| ...}
2830  *
2831  * instead of
2832  *
2833  * if m = pat.match(str)
2834  * ...
2835  * end
2836  *
2837  * The return value is a value from block execution in this case.
2838  */
2839 
/*
 * Body of the match method documented above. Takes one required argument
 * (the string) and one optional argument (the start position, read with
 * NUM2LONG; 0 when omitted). Returns Qnil when reg_match_pos reports a
 * negative position. Otherwise it fetches the current backref, marks it
 * busy, and returns either the block's value (when a block is given and the
 * backref is not nil) or the backref itself.
 *
 * NOTE(review): listing lines 2841 (function header) and 2855 (inside the
 * failure branch) are not visible in this listing.
 */
2840 static VALUE
2842 {
2843  VALUE result, str, initpos;
2844  long pos;
2845 
2846  if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
2847  pos = NUM2LONG(initpos);
2848  }
2849  else {
2850  pos = 0;
2851  }
2852 
2853  pos = reg_match_pos(re, &str, pos);
2854  if (pos < 0) {
2856  return Qnil;
2857  }
2858  result = rb_backref_get();
2859  rb_match_busy(result);
2860  if (!NIL_P(result) && rb_block_given_p()) {
2861  return rb_yield(result);
2862  }
2863  return result;
2864 }
2865 
2866 /*
2867  * Document-method: compile
2868  *
2869  * Synonym for <code>Regexp.new</code>
2870  */
2871 
2872 
2873 /*
2874  * call-seq:
2875  * Regexp.new(string, [options [, lang]]) -> regexp
2876  * Regexp.new(regexp) -> regexp
2877  * Regexp.compile(string, [options [, lang]]) -> regexp
2878  * Regexp.compile(regexp) -> regexp
2879  *
2880  * Constructs a new regular expression from <i>pattern</i>, which can be either
2881  * a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
2882  * options are propagated, and new options may not be specified; a change as of
2883  * Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or
2884  * more of the constants <code>Regexp::EXTENDED</code>,
2885  * <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
2886  * <em>or</em>-ed together. Otherwise, if <i>options</i> is not
2887  * <code>nil</code>, the regexp will be case insensitive.
2888  * When the <i>lang</i> parameter is `n' or `N', the regexp is created with no encoding (ASCII-8BIT).
2889  *
2890  * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
2891  * r2 = Regexp.new('cat', true) #=> /cat/i
2892  * r3 = Regexp.new('dog', Regexp::EXTENDED) #=> /dog/x
2893  * r4 = Regexp.new(r2) #=> /cat/i
2894  */
2895 
/*
 * Regexp#initialize (registered under "initialize" with arity -1).
 *
 * Accepts 1 to 3 arguments; any other count raises ArgumentError.
 *
 * - When argv[0] is a Regexp, its source, options and encoding are reused
 *   and any extra arguments only produce a "flags ignored" warning.
 * - Otherwise argv[0] is treated as a string pattern. argv[1], when a
 *   Fixnum, supplies the option flags; any other truthy value selects
 *   ONIG_OPTION_IGNORECASE. argv[2], when non-nil and starting with 'n' or
 *   'N', selects the ASCII-8BIT encoding and sets ARG_ENCODING_NONE; any
 *   other value produces a warning and is ignored.
 *
 * A non-zero result from the initializer is reported through
 * rb_reg_raise_str with the message collected in `err`. Returns self.
 *
 * NOTE(review): the function header (listing line 2897) is not visible in
 * this listing.
 */
2896 static VALUE
2898 {
2899  onig_errmsg_buffer err = "";
2900  int flags = 0;
2901  VALUE str;
2902  rb_encoding *enc;
2903  const char *ptr;
2904  long len;
2905 
2906  if (argc == 0 || argc > 3) {
2907  rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
2908  }
2909  if (TYPE(argv[0]) == T_REGEXP) {
2910  VALUE re = argv[0];
2911 
2912  if (argc > 1) {
2913  rb_warn("flags ignored");
2914  }
2915  rb_reg_check(re);
2916  flags = rb_reg_options(re);
2917  ptr = RREGEXP_SRC_PTR(re);
2918  len = RREGEXP_SRC_LEN(re);
2919  enc = rb_enc_get(re);
2920  if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
2921  str = rb_enc_str_new(ptr, len, enc);
2922  rb_reg_raise_str(str, flags, err);
2923  }
2924  }
2925  else {
2926  if (argc >= 2) {
2927  if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
2928  else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
2929  }
 /* enc stays 0 unless the third argument requests "no encoding" */
2930  enc = 0;
2931  if (argc == 3 && !NIL_P(argv[2])) {
2932  char *kcode = StringValuePtr(argv[2]);
2933  if (kcode[0] == 'n' || kcode[0] == 'N') {
2934  enc = rb_ascii8bit_encoding();
2935  flags |= ARG_ENCODING_NONE;
2936  }
2937  else {
2938  rb_warn("encoding option is ignored - %s", kcode);
2939  }
2940  }
2941  str = argv[0];
2942  ptr = StringValuePtr(str);
 /* explicit encoding: initialize from raw bytes; otherwise let the string supply its own encoding */
2943  if (enc
2944  ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
2945  : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
2946  rb_reg_raise_str(str, flags, err);
2947  }
2948  }
2949  return self;
2950 }
2951 
/*
 * Returns a copy of `str` in which regexp metacharacters are escaped.
 *
 * First pass: scan for the first character that needs escaping. If none is
 * found, the result is rb_str_new3(str). Otherwise a buffer of twice the
 * byte length of `str` is allocated, the clean prefix is copied verbatim,
 * and the rest is rewritten character by character:
 *   - punctuation metacharacters and '#' get a preceding backslash;
 *   - space becomes backslash + space;
 *   - tab, newline, carriage return, form feed and vertical tab become
 *     the two-character escapes \t \n \r \f \v.
 * Characters for which rb_enc_ascget returns -1 are copied byte for byte.
 * The result is shrunk to the bytes actually written and OBJ_INFECT is
 * applied from `str`.
 *
 * NOTE(review): listing lines 2953 (function header), 2982 and 2989 (the
 * bodies of the two `if (ascii_only)` branches) are not visible in this
 * listing.
 */
2952 VALUE
2954 {
2955  rb_encoding *enc = rb_enc_get(str);
2956  char *s, *send, *t;
2957  VALUE tmp;
2958  int c, clen;
2959  int ascii_only = rb_enc_str_asciionly_p(str);
2960 
2961  s = RSTRING_PTR(str);
2962  send = s + RSTRING_LEN(str);
2963  while (s < send) {
2964  c = rb_enc_ascget(s, send, &clen, enc);
2965  if (c == -1) {
2966  s += mbclen(s, send, enc);
2967  continue;
2968  }
2969  switch (c) {
2970  case '[': case ']': case '{': case '}':
2971  case '(': case ')': case '|': case '-':
2972  case '*': case '.': case '\\':
2973  case '?': case '+': case '^': case '$':
2974  case ' ': case '#':
2975  case '\t': case '\f': case '\v': case '\n': case '\r':
2976  goto meta_found;
2977  }
2978  s += clen;
2979  }
2980  tmp = rb_str_new3(str);
2981  if (ascii_only) {
2983  }
2984  return tmp;
2985 
2986  meta_found:
 /* worst case every character gains one escape character, hence twice the length */
2987  tmp = rb_str_new(0, RSTRING_LEN(str)*2);
2988  if (ascii_only) {
2990  }
2991  else {
2992  rb_enc_copy(tmp, str);
2993  }
2994  t = RSTRING_PTR(tmp);
2995  /* copy up to the first metacharacter */
2996  memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
2997  t += s - RSTRING_PTR(str);
2998 
2999  while (s < send) {
3000  c = rb_enc_ascget(s, send, &clen, enc);
3001  if (c == -1) {
3002  int n = mbclen(s, send, enc);
3003 
3004  while (n--)
3005  *t++ = *s++;
3006  continue;
3007  }
3008  s += clen;
3009  switch (c) {
3010  case '[': case ']': case '{': case '}':
3011  case '(': case ')': case '|': case '-':
3012  case '*': case '.': case '\\':
3013  case '?': case '+': case '^': case '$':
3014  case '#':
 /* emit the backslash here; the character itself is written after the switch */
3015  t += rb_enc_mbcput('\\', t, enc);
3016  break;
3017  case ' ':
3018  t += rb_enc_mbcput('\\', t, enc);
3019  t += rb_enc_mbcput(' ', t, enc);
3020  continue;
3021  case '\t':
3022  t += rb_enc_mbcput('\\', t, enc);
3023  t += rb_enc_mbcput('t', t, enc);
3024  continue;
3025  case '\n':
3026  t += rb_enc_mbcput('\\', t, enc);
3027  t += rb_enc_mbcput('n', t, enc);
3028  continue;
3029  case '\r':
3030  t += rb_enc_mbcput('\\', t, enc);
3031  t += rb_enc_mbcput('r', t, enc);
3032  continue;
3033  case '\f':
3034  t += rb_enc_mbcput('\\', t, enc);
3035  t += rb_enc_mbcput('f', t, enc);
3036  continue;
3037  case '\v':
3038  t += rb_enc_mbcput('\\', t, enc);
3039  t += rb_enc_mbcput('v', t, enc);
3040  continue;
3041  }
3042  t += rb_enc_mbcput(c, t, enc);
3043  }
3044  rb_str_resize(tmp, t - RSTRING_PTR(tmp));
3045  OBJ_INFECT(tmp, str);
3046  return tmp;
3047 }
3048 
3049 
3050 /*
3051  * call-seq:
3052  * Regexp.escape(str) -> string
3053  * Regexp.quote(str) -> string
3054  *
3055  * Escapes any characters that would have special meaning in a regular
3056  * expression. Returns a new escaped string, or self if no characters are
3057  * escaped. For any string,
3058  * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
3059  *
3060  * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
3061  *
3062  */
3063 
/*
 * Body of the escape/quote class method documented above: converts the
 * argument with reg_operand in raising mode (TypeError when it cannot be
 * converted to a String) and passes the result to rb_reg_quote.
 *
 * NOTE(review): the function header (listing line 3065) is not visible in
 * this listing.
 */
3064 static VALUE
3066 {
3067  return rb_reg_quote(reg_operand(str, TRUE));
3068 }
3069 
/*
 * Returns the option bits of `re`: the compiled pattern's options masked
 * with ARG_REG_OPTION_MASK, plus ARG_ENCODING_FIXED when the object has the
 * KCODE_FIXED flag and ARG_ENCODING_NONE when it has REG_ENCODING_NONE.
 * rb_reg_check(re) is called first.
 *
 * NOTE(review): the function header (listing line 3071) is not visible in
 * this listing.
 */
3070 int
3072 {
3073  int options;
3074 
3075  rb_reg_check(re);
3076  options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
3077  if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
3078  if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
3079  return options;
3080 }
3081 
/*
 * Attempts to convert `re` to a Regexp by delegating to
 * rb_check_convert_type with the "to_regexp" conversion method, and returns
 * whatever that call returns.
 *
 * NOTE(review): the function header (listing line 3083) is not visible in
 * this listing.
 */
3082 VALUE
3084 {
3085  return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
3086 }
3087 
3088 /*
3089  * call-seq:
3090  * Regexp.try_convert(obj) -> re or nil
3091  *
3092  * Try to convert <i>obj</i> into a Regexp, using to_regexp method.
3093  * Returns converted regexp or nil if <i>obj</i> cannot be converted
3094  * for any reason.
3095  *
3096  * Regexp.try_convert(/re/) #=> /re/
3097  * Regexp.try_convert("re") #=> nil
3098  *
3099  * o = Object.new
3100  * Regexp.try_convert(o) #=> nil
3101  * def o.to_regexp() /foo/ end
3102  * Regexp.try_convert(o) #=> /foo/
3103  *
3104  */
/*
 * Body of the try_convert class method documented above; a thin wrapper
 * that returns rb_check_regexp_type(re).
 *
 * NOTE(review): the function header (listing line 3106) is not visible in
 * this listing.
 */
3105 static VALUE
3107 {
3108  return rb_check_regexp_type(re);
3109 }
3110 
/*
 * Builds the union regexp from the array of patterns `args0`.
 *
 * - No patterns: a new Regexp built from the source "(?!)".
 * - One pattern: the pattern itself when it converts to a Regexp,
 *   otherwise a regexp compiled from the quoted string.
 * - Several patterns: each element is turned into source text (regexps via
 *   rb_reg_to_s, everything else via StringValue + quoting) and the pieces
 *   are joined with "|".
 *
 * While joining, encodings are tracked in three categories: ASCII
 * incompatible, ASCII compatible with a fixed encoding, and ASCII only.
 * ArgumentError is raised when two different encodings appear in the same
 * fixed category, or when an ASCII-incompatible encoding is mixed with
 * either of the other two categories. The result encoding is the
 * ASCII-incompatible one if present, else the fixed ASCII-compatible one,
 * else ASCII-8BIT.
 *
 * NOTE(review): the function header (listing line 3112) is not visible in
 * this listing.
 */
3111 static VALUE
3113 {
3114  long argc = RARRAY_LEN(args0);
3115 
3116  if (argc == 0) {
3117  VALUE args[1];
3118  args[0] = rb_str_new2("(?!)");
3119  return rb_class_new_instance(1, args, rb_cRegexp);
3120  }
3121  else if (argc == 1) {
3122  VALUE arg = rb_ary_entry(args0, 0);
3123  VALUE re = rb_check_regexp_type(arg);
3124  if (!NIL_P(re))
3125  return re;
3126  else {
3127  VALUE quoted;
3128  quoted = rb_reg_s_quote(Qnil, arg);
3129  return rb_reg_new_str(quoted, 0);
3130  }
3131  }
3132  else {
3133  int i;
3134  VALUE source = rb_str_buf_new(0);
3135  rb_encoding *result_enc;
3136 
 /* encoding categories seen so far; each pointer remembers the first encoding of its kind */
3137  int has_asciionly = 0;
3138  rb_encoding *has_ascii_compat_fixed = 0;
3139  rb_encoding *has_ascii_incompat = 0;
3140 
3141  for (i = 0; i < argc; i++) {
3142  volatile VALUE v;
3143  VALUE e = rb_ary_entry(args0, i);
3144 
3145  if (0 < i)
3146  rb_str_buf_cat_ascii(source, "|");
3147 
3148  v = rb_check_regexp_type(e);
3149  if (!NIL_P(v)) {
 /* element is (or converts to) a Regexp: classify by its encoding */
3150  rb_encoding *enc = rb_enc_get(v);
3151  if (!rb_enc_asciicompat(enc)) {
3152  if (!has_ascii_incompat)
3153  has_ascii_incompat = enc;
3154  else if (has_ascii_incompat != enc)
3155  rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3156  rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
3157  }
3158  else if (rb_reg_fixed_encoding_p(v)) {
3159  if (!has_ascii_compat_fixed)
3160  has_ascii_compat_fixed = enc;
3161  else if (has_ascii_compat_fixed != enc)
3162  rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3163  rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3164  }
3165  else {
3166  has_asciionly = 1;
3167  }
3168  v = rb_reg_to_s(v);
3169  }
3170  else {
 /* element is treated as a string: classify by its encoding, then quote it */
3171  rb_encoding *enc;
3172  StringValue(e);
3173  enc = rb_enc_get(e);
3174  if (!rb_enc_str_asciicompat_p(e)) {
3175  if (!has_ascii_incompat)
3176  has_ascii_incompat = enc;
3177  else if (has_ascii_incompat != enc)
3178  rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3179  rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
3180  }
3181  else if (rb_enc_str_asciionly_p(e)) {
3182  has_asciionly = 1;
3183  }
3184  else {
3185  if (!has_ascii_compat_fixed)
3186  has_ascii_compat_fixed = enc;
3187  else if (has_ascii_compat_fixed != enc)
3188  rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3189  rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3190  }
3191  v = rb_reg_s_quote(Qnil, e);
3192  }
 /* an ASCII-incompatible encoding cannot be combined with either other category */
3193  if (has_ascii_incompat) {
3194  if (has_asciionly) {
3195  rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
3196  rb_enc_name(has_ascii_incompat));
3197  }
3198  if (has_ascii_compat_fixed) {
3199  rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3200  rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
3201  }
3202  }
3203 
 /* the buffer takes the encoding of the first piece before anything is appended */
3204  if (i == 0) {
3205  rb_enc_copy(source, v);
3206  }
3207  rb_str_append(source, v);
3208  }
3209 
3210  if (has_ascii_incompat) {
3211  result_enc = has_ascii_incompat;
3212  }
3213  else if (has_ascii_compat_fixed) {
3214  result_enc = has_ascii_compat_fixed;
3215  }
3216  else {
3217  result_enc = rb_ascii8bit_encoding();
3218  }
3219 
3220  rb_enc_associate(source, result_enc);
3221  return rb_class_new_instance(1, &source, rb_cRegexp);
3222  }
3223 }
3224 
3225 /*
3226  * call-seq:
3227  * Regexp.union(pat1, pat2, ...) -> new_regexp
3228  * Regexp.union(pats_ary) -> new_regexp
3229  *
3230  * Return a <code>Regexp</code> object that is the union of the given
3231  * <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
3232  * can be Regexp objects, in which case their options will be preserved, or
3233  * Strings. If no patterns are given, returns <code>/(?!)/</code>.
3234  * The behavior is unspecified if any given <em>pattern</em> contains capture.
3235  *
3236  * Regexp.union #=> /(?!)/
3237  * Regexp.union("penzance") #=> /penzance/
3238  * Regexp.union("a+b*c") #=> /a\+b\*c/
3239  * Regexp.union("skiing", "sledding") #=> /skiing|sledding/
3240  * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3241  * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
3242  */
/*
 * Entry point for the union class method documented above. When exactly
 * one argument is given and it converts to an Array, that array is used as
 * the pattern list; otherwise the argument list itself is used. Both cases
 * delegate to rb_reg_s_union.
 *
 * NOTE(review): the function header (listing line 3244) is not visible in
 * this listing.
 */
3243 static VALUE
3245 {
3246  VALUE v;
3247  if (RARRAY_LEN(args) == 1 &&
3248  !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
3249  return rb_reg_s_union(self, v);
3250  }
3251  return rb_reg_s_union(self, args);
3252 }
3253 
3254 /* :nodoc: */
/*
 * Regexp#initialize_copy (registered under "initialize_copy", arity 1).
 *
 * Re-initializes `copy` from the source, encoding and options of `re`.
 * Copying an object onto itself is a no-op. A frozen `copy` is rejected by
 * rb_check_frozen, and TypeError is raised when `re` is not an instance of
 * exactly the class of `copy`. A non-zero result from rb_reg_initialize is
 * reported through rb_reg_raise. Returns `copy`.
 *
 * NOTE(review): the function header (listing line 3256) is not visible in
 * this listing.
 */
3255 static VALUE
3257 {
3258  onig_errmsg_buffer err = "";
3259  const char *s;
3260  long len;
3261 
3262  if (copy == re) return copy;
3263  rb_check_frozen(copy);
3264  /* need better argument type check */
3265  if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
3266  rb_raise(rb_eTypeError, "wrong argument type");
3267  }
3268  rb_reg_check(re);
3269  s = RREGEXP_SRC_PTR(re);
3270  len = RREGEXP_SRC_LEN(re);
3271  if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
3272  err, NULL, 0) != 0) {
3273  rb_reg_raise(s, len, err, re);
3274  }
3275  return copy;
3276 }
3277 
/*
 * Expands backslash sequences in the replacement template `str` using the
 * match registers `regs` taken over the subject string `src`.
 *
 * Recognized sequences (each introduced by a backslash):
 *   1-9      numbered group; only when unnamed-group capture is active for
 *            `regexp`, otherwise the sequence is dropped
 *   k<name>  named group, resolved through name_to_backref_number; an
 *            unterminated name raises RuntimeError
 *   0, &     the whole match
 *   `        the part of `src` before the match
 *   '        the part of `src` after the match
 *   +        the highest-numbered group that participated in the match
 *   \        a literal backslash
 * Any other escaped character is copied through together with its
 * backslash. A group number that is out of range, or whose begin register
 * is -1, contributes nothing.
 *
 * Returns `str` itself when no result buffer was ever allocated (no
 * backslash followed by another character was found); otherwise returns
 * the newly built string.
 *
 * `p` marks the start of the template text not yet copied to the result;
 * `s` is the scan cursor.
 */
3278 VALUE
3279 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
3280 {
3281  VALUE val = 0;
3282  char *p, *s, *e;
3283  int no, clen;
3284  rb_encoding *str_enc = rb_enc_get(str);
3285  rb_encoding *src_enc = rb_enc_get(src);
3286  int acompat = rb_enc_asciicompat(str_enc);
 /* fast path for ASCII-compatible encodings: read one byte directly instead of calling rb_enc_ascget */
3287 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
3288 
3289  p = s = RSTRING_PTR(str);
3290  e = s + RSTRING_LEN(str);
3291 
3292  while (s < e) {
3293  int c = ASCGET(s, e, &clen);
3294  char *ss;
3295 
3296  if (c == -1) {
3297  s += mbclen(s, e, str_enc);
3298  continue;
3299  }
3300  ss = s;
3301  s += clen;
3302 
 /* only a backslash followed by at least one more character starts a sequence */
3303  if (c != '\\' || s == e) continue;
3304 
3305  if (!val) {
3306  val = rb_str_buf_new(ss-p);
3307  }
 /* flush the literal text that precedes the backslash */
3308  rb_enc_str_buf_cat(val, p, ss-p, str_enc);
3309 
3310  c = ASCGET(s, e, &clen);
3311  if (c == -1) {
3312  s += mbclen(s, e, str_enc);
3313  rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3314  p = s;
3315  continue;
3316  }
3317  s += clen;
3318 
3319  p = s;
3320  switch (c) {
3321  case '1': case '2': case '3': case '4':
3322  case '5': case '6': case '7': case '8': case '9':
3323  if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
3324  no = c - '0';
3325  }
3326  else {
3327  continue;
3328  }
3329  break;
3330 
3331  case 'k':
3332  if (s < e && ASCGET(s, e, &clen) == '<') {
3333  char *name, *name_end;
3334 
3335  name_end = name = s + clen;
3336  while (name_end < e) {
3337  c = ASCGET(name_end, e, &clen);
3338  if (c == '>') break;
3339  name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
3340  }
3341  if (name_end < e) {
3342  no = name_to_backref_number(regs, regexp, name, name_end);
3343  p = s = name_end + clen;
3344  break;
3345  }
3346  else {
3347  rb_raise(rb_eRuntimeError, "invalid group name reference format");
3348  }
3349  }
3350 
 /* "\k" not followed by '<': copy the two characters through unchanged */
3351  rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3352  continue;
3353 
3354  case '0':
3355  case '&':
3356  no = 0;
3357  break;
3358 
3359  case '`':
3360  rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
3361  continue;
3362 
3363  case '\'':
3364  rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
3365  continue;
3366 
3367  case '+':
 /* walk down from the last register to the first one that matched */
3368  no = regs->num_regs-1;
3369  while (BEG(no) == -1 && no > 0) no--;
3370  if (no == 0) continue;
3371  break;
3372 
3373  case '\\':
3374  rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
3375  continue;
3376 
3377  default:
3378  rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3379  continue;
3380  }
3381 
3382  if (no >= 0) {
3383  if (no >= regs->num_regs) continue;
3384  if (BEG(no) == -1) continue;
3385  rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
3386  }
3387  }
3388 
3389  if (!val) return str;
 /* append whatever literal text remains after the last sequence */
3390  if (p < e) {
3391  rb_enc_str_buf_cat(val, p, e-p, str_enc);
3392  }
3393 
3394  return val;
3395 }
3396 
/*
 * Read hook for the obsolete $KCODE variable: emits a warning and returns
 * Qnil.
 *
 * NOTE(review): the function header (listing line 3398) is not visible in
 * this listing.
 */
3397 static VALUE
3399 {
3400  rb_warn("variable $KCODE is no longer effective");
3401  return Qnil;
3402 }
3403 
/*
 * Write hook for the obsolete $KCODE variable: emits a warning and does
 * nothing else, so the assignment has no effect.
 *
 * NOTE(review): the function header (listing line 3405) is not visible in
 * this listing.
 */
3404 static void
3406 {
3407  rb_warn("variable $KCODE is no longer effective; ignored");
3408 }
3409 
/*
 * Read hook for the obsolete $= variable: emits a warning and returns
 * Qfalse.
 *
 * NOTE(review): the function header (listing line 3411) is not visible in
 * this listing.
 */
3410 static VALUE
3412 {
3413  rb_warn("variable $= is no longer effective");
3414  return Qfalse;
3415 }
3416 
/*
 * Write hook for the obsolete $= variable: emits a warning and does nothing
 * else, so the assignment has no effect.
 *
 * NOTE(review): the function header (listing line 3418) is not visible in
 * this listing.
 */
3417 static void
3419 {
3420  rb_warn("variable $= is no longer effective; ignored");
3421 }
3422 
/*
 * Returns the current match object, or Qnil when there is none. A non-nil
 * match is marked busy with rb_match_busy before it is handed out.
 *
 * NOTE(review): listing lines 3424 (function header) and 3426 (the
 * declaration/initialization of `match`) are not visible in this listing.
 */
3423 static VALUE
3425 {
3427 
3428  if (NIL_P(match)) return Qnil;
3429  rb_match_busy(match);
3430  return match;
3431 }
3432 
/*
 * Stores `val` as the current backref. A non-nil value must be a T_MATCH
 * (enforced with Check_Type); nil is passed through unchecked.
 *
 * NOTE(review): the function header (listing line 3434) is not visible in
 * this listing.
 */
3433 static void
3435 {
3436  if (!NIL_P(val)) {
3437  Check_Type(val, T_MATCH);
3438  }
3439  rb_backref_set(val);
3440 }
3441 
3442 /*
3443  * call-seq:
3444  * Regexp.last_match -> matchdata
3445  * Regexp.last_match(n) -> str
3446  *
3447  * The first form returns the <code>MatchData</code> object generated by the
3448  * last successful pattern match. Equivalent to reading the global variable
3449  * <code>$~</code>. The second form returns the <i>n</i>th field in this
3450  * <code>MatchData</code> object.
3451  * <em>n</em> can be a string or symbol to reference a named capture.
3452  *
3453  * Note that the <code>last_match</code> is local to the thread and method scope
3454  * of the method that did the pattern match.
3455  *
3456  * /c(.)t/ =~ 'cat' #=> 0
3457  * Regexp.last_match #=> #<MatchData "cat" 1:"a">
3458  * Regexp.last_match(0) #=> "cat"
3459  * Regexp.last_match(1) #=> "a"
3460  * Regexp.last_match(2) #=> nil
3461  *
3462  * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
3463  * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val">
3464  * Regexp.last_match(:lhs) #=> "var"
3465  * Regexp.last_match(:rhs) #=> "val"
3466  */
3467 
/*
 * Body of the last_match class method documented above. With an argument,
 * the argument is resolved to a group number with match_backref_number and
 * the corresponding group is returned via rb_reg_nth_match (Qnil when there
 * is no current match). Without an argument, the result of match_getter()
 * is returned.
 *
 * NOTE(review): listing lines 3469 (function header) and 3474 (the
 * declaration/initialization of `match`) are not visible in this listing.
 */
3468 static VALUE
3470 {
3471  VALUE nth;
3472 
3473  if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
3475  int n;
3476  if (NIL_P(match)) return Qnil;
3477  n = match_backref_number(match, nth);
3478  return rb_reg_nth_match(n, match);
3479  }
3480  return match_getter();
3481 }
3482 
/*
 * Forwards an already formatted warning message to rb_warn.
 *
 * s - the message text.
 */
static void
re_warn(const char *s)
{
    /* Pass the text as an argument so any '%' in it is printed literally. */
    rb_warn("%s", s);
}
3488 
3489 /*
3490  * Document-class: RegexpError
3491  *
3492  * Raised when given an invalid regexp expression.
3493  *
3494  * Regexp.new("?")
3495  *
3496  * <em>raises the exception:</em>
3497  *
3498  * RegexpError: target of repeat operator is not specified: /?/
3499  */
3500 
3501 /*
3502  * Document-class: Regexp
3503  *
3504  * A <code>Regexp</code> holds a regular expression, used to match a pattern
3505  * against strings. Regexps are created using the <code>/.../</code> and
3506  * <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
3507  * constructor.
3508  *
3509  * :include: doc/re.rdoc
3510  */
3511 
/*
 * Initialization routine for this file: defines the Regexp and MatchData
 * classes under Object and registers their instance methods with the C
 * functions implemented above.
 *
 * NOTE(review): many lines of this function (the header at listing line
 * 3513, the engine and global-variable setup, several method registrations,
 * and the constant definitions that belong to the repeated "see
 * Regexp.options" comments) are not visible in this listing -- consult the
 * real file before editing.
 */
3512 void
3514 {
3516 
3521 
3527 
3531 
3532  rb_cRegexp = rb_define_class("Regexp", rb_cObject);
3540 
3541  rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
3542  rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
3551  rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
3552  rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
3553  rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
3555  rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
3556  rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
3558  rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
3559 
3560  /* see Regexp.options and Regexp.new */
3562  /* see Regexp.options and Regexp.new */
3564  /* see Regexp.options and Regexp.new */
3566  /* see Regexp.options and Regexp.new */
3568  /* see Regexp.options and Regexp.new */
3570 
3572 
3573  rb_cMatch = rb_define_class("MatchData", rb_cObject);
3576 
3577  rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
3578  rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
3579  rb_define_method(rb_cMatch, "names", match_names, 0);
 /* "size" and "length" are two names for the same implementation */
3580  rb_define_method(rb_cMatch, "size", match_size, 0);
3581  rb_define_method(rb_cMatch, "length", match_size, 0);
3582  rb_define_method(rb_cMatch, "offset", match_offset, 1);
3583  rb_define_method(rb_cMatch, "begin", match_begin, 1);
3584  rb_define_method(rb_cMatch, "end", match_end, 1);
3585  rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
3587  rb_define_method(rb_cMatch, "captures", match_captures, 0);
3588  rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
3589  rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
3590  rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
3591  rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
3592  rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
3593  rb_define_method(rb_cMatch, "string", match_string, 0);
3594  rb_define_method(rb_cMatch, "hash", match_hash, 0);
3595  rb_define_method(rb_cMatch, "eql?", match_equal, 1);
3597 }
3598