• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

ext/strscan/strscan.c

Go to the documentation of this file.
00001 /*
00002     $Id: strscan.c 27437 2010-04-22 08:04:13Z nobu $
00003 
00004     Copyright (c) 1999-2006 Minero Aoki
00005 
00006     This program is free software.
00007     You can distribute/modify this program under the terms of
00008     the Ruby License. For details, see the file COPYING.
00009 */
00010 
00011 #include "ruby/ruby.h"
00012 #include "ruby/re.h"
00013 #include "ruby/encoding.h"
00014 
00015 #define STRSCAN_VERSION "0.7.0"
00016 
00017 /* =======================================================================
00018                          Data Type Definitions
00019    ======================================================================= */
00020 
00021 static VALUE StringScanner;
00022 static VALUE ScanError;
00023 
00024 struct strscanner
00025 {
00026     /* multi-purpose flags */
00027     unsigned long flags;
00028 #define FLAG_MATCHED (1 << 0)
00029 
00030     /* the string to scan */
00031     VALUE str;
00032 
00033     /* scan pointers */
00034     long prev;   /* legal only when MATCHED_P(s) */
00035     long curr;   /* always legal */
00036 
00037     /* the regexp register; legal only when MATCHED_P(s) */
00038     struct re_registers regs;
00039 };
00040 
/* Match-status flag helpers; fully parenthesized so they are safe in any expression. */
#define MATCHED_P(s)          ((s)->flags & FLAG_MATCHED)
#define MATCHED(s)            ((s)->flags |= FLAG_MATCHED)
#define CLEAR_MATCH_STATUS(s) ((s)->flags &= ~FLAG_MATCHED)

/* Accessors for the scanned string and the byte range still to be scanned. */
#define S_PBEG(s)  (RSTRING_PTR((s)->str))
#define S_LEN(s)  (RSTRING_LEN((s)->str))
#define S_PEND(s)  (S_PBEG(s) + S_LEN(s))
#define CURPTR(s) (S_PBEG(s) + (s)->curr)
#define S_RESTLEN(s) (S_LEN(s) - (s)->curr)

/* True when the scan pointer is at or past the end of the string.
   Uses the macro argument throughout, so the caller's variable need
   not be named `p`. */
#define EOS_P(s) ((s)->curr >= RSTRING_LEN((s)->str))

/* Fetch the struct behind obj into var; raise ArgumentError when the
   scanner has no string yet (str is still nil). */
#define GET_SCANNER(obj,var) do {\
    Data_Get_Struct(obj, struct strscanner, var);\
    if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");\
} while (0)
00057 
00058 /* =======================================================================
00059                             Function Prototypes
00060    ======================================================================= */
00061 
00062 static VALUE infect _((VALUE str, struct strscanner *p));
00063 static VALUE extract_range _((struct strscanner *p, long beg_i, long end_i));
00064 static VALUE extract_beg_len _((struct strscanner *p, long beg_i, long len));
00065 
00066 void check_strscan _((VALUE obj));
00067 static void strscan_mark _((struct strscanner *p));
00068 static void strscan_free _((struct strscanner *p));
00069 static VALUE strscan_s_allocate _((VALUE klass));
00070 static VALUE strscan_initialize _((int argc, VALUE *argv, VALUE self));
00071 static VALUE strscan_init_copy _((VALUE vself, VALUE vorig));
00072 
00073 static VALUE strscan_s_mustc _((VALUE self));
00074 static VALUE strscan_terminate _((VALUE self));
00075 static VALUE strscan_clear _((VALUE self));
00076 static VALUE strscan_get_string _((VALUE self));
00077 static VALUE strscan_set_string _((VALUE self, VALUE str));
00078 static VALUE strscan_concat _((VALUE self, VALUE str));
00079 static VALUE strscan_get_pos _((VALUE self));
00080 static VALUE strscan_set_pos _((VALUE self, VALUE pos));
00081 static VALUE strscan_do_scan _((VALUE self, VALUE regex,
00082                                 int succptr, int getstr, int headonly));
00083 static VALUE strscan_scan _((VALUE self, VALUE re));
00084 static VALUE strscan_match_p _((VALUE self, VALUE re));
00085 static VALUE strscan_skip _((VALUE self, VALUE re));
00086 static VALUE strscan_check _((VALUE self, VALUE re));
00087 static VALUE strscan_scan_full _((VALUE self, VALUE re,
00088                                   VALUE succp, VALUE getp));
00089 static VALUE strscan_scan_until _((VALUE self, VALUE re));
00090 static VALUE strscan_skip_until _((VALUE self, VALUE re));
00091 static VALUE strscan_check_until _((VALUE self, VALUE re));
00092 static VALUE strscan_search_full _((VALUE self, VALUE re,
00093                                     VALUE succp, VALUE getp));
00094 static void adjust_registers_to_matched _((struct strscanner *p));
00095 static VALUE strscan_getch _((VALUE self));
00096 static VALUE strscan_get_byte _((VALUE self));
00097 static VALUE strscan_getbyte _((VALUE self));
00098 static VALUE strscan_peek _((VALUE self, VALUE len));
00099 static VALUE strscan_peep _((VALUE self, VALUE len));
00100 static VALUE strscan_unscan _((VALUE self));
00101 static VALUE strscan_bol_p _((VALUE self));
00102 static VALUE strscan_eos_p _((VALUE self));
00103 static VALUE strscan_empty_p _((VALUE self));
00104 static VALUE strscan_rest_p _((VALUE self));
00105 static VALUE strscan_matched_p _((VALUE self));
00106 static VALUE strscan_matched _((VALUE self));
00107 static VALUE strscan_matched_size _((VALUE self));
00108 static VALUE strscan_aref _((VALUE self, VALUE idx));
00109 static VALUE strscan_pre_match _((VALUE self));
00110 static VALUE strscan_post_match _((VALUE self));
00111 static VALUE strscan_rest _((VALUE self));
00112 static VALUE strscan_rest_size _((VALUE self));
00113 
00114 static VALUE strscan_inspect _((VALUE self));
00115 static VALUE inspect1 _((struct strscanner *p));
00116 static VALUE inspect2 _((struct strscanner *p));
00117 
00118 /* =======================================================================
00119                                    Utils
00120    ======================================================================= */
00121 
00122 static VALUE
00123 infect(VALUE str, struct strscanner *p)
00124 {
00125     OBJ_INFECT(str, p->str);
00126     return str;
00127 }
00128 
00129 static VALUE
00130 str_new(struct strscanner *p, const char *ptr, long len)
00131 {
00132     VALUE str = rb_str_new(ptr, len);
00133     rb_enc_copy(str, p->str);
00134     return str;
00135 }
00136 
00137 static VALUE
00138 extract_range(struct strscanner *p, long beg_i, long end_i)
00139 {
00140     if (beg_i > S_LEN(p)) return Qnil;
00141     if (end_i > S_LEN(p))
00142         end_i = S_LEN(p);
00143     return infect(str_new(p, S_PBEG(p) + beg_i, end_i - beg_i), p);
00144 }
00145 
00146 static VALUE
00147 extract_beg_len(struct strscanner *p, long beg_i, long len)
00148 {
00149     if (beg_i > S_LEN(p)) return Qnil;
00150     if (beg_i + len > S_LEN(p))
00151         len = S_LEN(p) - beg_i;
00152     return infect(str_new(p, S_PBEG(p) + beg_i, len), p);
00153 }
00154 
00155 /* =======================================================================
00156                                Constructor
00157    ======================================================================= */
00158 
00159 static void
00160 strscan_mark(struct strscanner *p)
00161 {
00162     rb_gc_mark(p->str);
00163 }
00164 
00165 static void
00166 strscan_free(struct strscanner *p)
00167 {
00168     onig_region_free(&(p->regs), 0);
00169     ruby_xfree(p);
00170 }
00171 
00172 static VALUE
00173 strscan_s_allocate(VALUE klass)
00174 {
00175     struct strscanner *p;
00176 
00177     p = ALLOC(struct strscanner);
00178     MEMZERO(p, struct strscanner, 1);
00179     CLEAR_MATCH_STATUS(p);
00180     onig_region_init(&(p->regs));
00181     p->str = Qnil;
00182     return Data_Wrap_Struct(klass, strscan_mark, strscan_free, p);
00183 }
00184 
00185 /*
00186  * call-seq: StringScanner.new(string, dup = false)
00187  *
00188  * Creates a new StringScanner object to scan over the given +string+.
00189  * +dup+ argument is obsolete and not used now.
00190  */
00191 static VALUE
00192 strscan_initialize(int argc, VALUE *argv, VALUE self)
00193 {
00194     struct strscanner *p;
00195     VALUE str, need_dup;
00196 
00197     Data_Get_Struct(self, struct strscanner, p);
00198     rb_scan_args(argc, argv, "11", &str, &need_dup);
00199     StringValue(str);
00200     p->str = str;
00201 
00202     return self;
00203 }
00204 
00205 void
00206 check_strscan(VALUE obj)
00207 {
00208     if (TYPE(obj) != T_DATA || RDATA(obj)->dmark != (RUBY_DATA_FUNC)strscan_mark) {
00209         rb_raise(rb_eTypeError,
00210                  "wrong argument type %s (expected StringScanner)",
00211                  rb_obj_classname(obj));
00212     }
00213 }
00214 
00215 /*
00216  * call-seq:
00217  *   dup
00218  *   clone
00219  *
00220  * Duplicates a StringScanner object.
00221  */
00222 static VALUE
00223 strscan_init_copy(VALUE vself, VALUE vorig)
00224 {
00225     struct strscanner *self, *orig;
00226 
00227     Data_Get_Struct(vself, struct strscanner, self);
00228     check_strscan(vorig);
00229     Data_Get_Struct(vorig, struct strscanner, orig);
00230     if (self != orig) {
00231         self->flags = orig->flags;
00232         self->str = orig->str;
00233         self->prev = orig->prev;
00234         self->curr = orig->curr;
00235         onig_region_copy(&self->regs, &orig->regs);
00236     }
00237 
00238     return vself;
00239 }
00240 
00241 /* =======================================================================
00242                           Instance Methods
00243    ======================================================================= */
00244 
00245 /*
00246  * call-seq: StringScanner.must_C_version
00247  *
00248  * This method is defined for backward compatibility.
00249  */
00250 static VALUE
00251 strscan_s_mustc(VALUE self)
00252 {
00253     return self;
00254 }
00255 
00256 /*
00257  * Reset the scan pointer (index 0) and clear matching data.
00258  */
00259 static VALUE
00260 strscan_reset(VALUE self)
00261 {
00262     struct strscanner *p;
00263 
00264     GET_SCANNER(self, p);
00265     p->curr = 0;
00266     CLEAR_MATCH_STATUS(p);
00267     return self;
00268 }
00269 
00270 /*
00271  * call-seq:
00272  *   terminate
00273  *   clear
00274  *
00275  * Set the scan pointer to the end of the string and clear matching data.
00276  */
00277 static VALUE
00278 strscan_terminate(VALUE self)
00279 {
00280     struct strscanner *p;
00281 
00282     GET_SCANNER(self, p);
00283     p->curr = S_LEN(p);
00284     CLEAR_MATCH_STATUS(p);
00285     return self;
00286 }
00287 
00288 /*
00289  * Equivalent to #terminate.
00290  * This method is obsolete; use #terminate instead.
00291  */
00292 static VALUE
00293 strscan_clear(VALUE self)
00294 {
00295     rb_warning("StringScanner#clear is obsolete; use #terminate instead");
00296     return strscan_terminate(self);
00297 }
00298 
00299 /*
00300  * Returns the string being scanned.
00301  */
00302 static VALUE
00303 strscan_get_string(VALUE self)
00304 {
00305     struct strscanner *p;
00306 
00307     GET_SCANNER(self, p);
00308     return p->str;
00309 }
00310 
00311 /*
00312  * call-seq: string=(str)
00313  *
00314  * Changes the string being scanned to +str+ and resets the scanner.
00315  * Returns +str+.
00316  */
00317 static VALUE
00318 strscan_set_string(VALUE self, VALUE str)
00319 {
00320     struct strscanner *p;
00321 
00322     Data_Get_Struct(self, struct strscanner, p);
00323     StringValue(str);
00324     p->str = str;
00325     p->curr = 0;
00326     CLEAR_MATCH_STATUS(p);
00327     return str;
00328 }
00329 
00330 /*
00331  * call-seq:
00332  *   concat(str)
00333  *   <<(str)
00334  *
00335  * Appends +str+ to the string being scanned.
00336  * This method does not affect scan pointer.
00337  *
00338  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00339  *   s.scan(/Fri /)
00340  *   s << " +1000 GMT"
00341  *   s.string            # -> "Fri Dec 12 1975 14:39 +1000 GMT"
00342  *   s.scan(/Dec/)       # -> "Dec"
00343  */
00344 static VALUE
00345 strscan_concat(VALUE self, VALUE str)
00346 {
00347     struct strscanner *p;
00348 
00349     GET_SCANNER(self, p);
00350     StringValue(str);
00351     rb_str_append(p->str, str);
00352     return self;
00353 }
00354 
00355 /*
00356  * Returns the byte position of the scan pointer.  In the 'reset' position, this
00357  * value is zero.  In the 'terminated' position (i.e. the string is exhausted),
00358  * this value is the bytesize of the string.
00359  *
00360  * In short, it's a 0-based index into the string.
00361  *
00362  *   s = StringScanner.new('test string')
00363  *   s.pos               # -> 0
00364  *   s.scan_until /str/  # -> "test str"
00365  *   s.pos               # -> 8
00366  *   s.terminate         # -> #<StringScanner fin>
00367  *   s.pos               # -> 11
00368  */
00369 static VALUE
00370 strscan_get_pos(VALUE self)
00371 {
00372     struct strscanner *p;
00373 
00374     GET_SCANNER(self, p);
00375     return INT2FIX(p->curr);
00376 }
00377 
00378 /*
00379  * call-seq: pos=(n)
00380  *
00381  * Set the byte position of the scan pointer.
00382  *
00383  *   s = StringScanner.new('test string')
00384  *   s.pos = 7            # -> 7
00385  *   s.rest               # -> "ring"
00386  */
00387 static VALUE
00388 strscan_set_pos(VALUE self, VALUE v)
00389 {
00390     struct strscanner *p;
00391     long i;
00392 
00393     GET_SCANNER(self, p);
00394     i = NUM2INT(v);
00395     if (i < 0) i += S_LEN(p);
00396     if (i < 0) rb_raise(rb_eRangeError, "index out of range");
00397     if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range");
00398     p->curr = i;
00399     return INT2NUM(i);
00400 }
00401 
00402 static VALUE
00403 strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
00404 {
00405     regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
00406     struct strscanner *p;
00407     regex_t *re;
00408     int ret;
00409     int tmpreg;
00410 
00411     Check_Type(regex, T_REGEXP);
00412     GET_SCANNER(self, p);
00413 
00414     CLEAR_MATCH_STATUS(p);
00415     if (S_RESTLEN(p) < 0) {
00416         return Qnil;
00417     }
00418     re = rb_reg_prepare_re(regex, p->str);
00419     tmpreg = re != RREGEXP(regex)->ptr;
00420     if (!tmpreg) RREGEXP(regex)->usecnt++;
00421 
00422     if (headonly) {
00423         ret = onig_match(re, (UChar* )CURPTR(p),
00424                          (UChar* )(CURPTR(p) + S_RESTLEN(p)),
00425                          (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
00426     }
00427     else {
00428         ret = onig_search(re,
00429                           (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
00430                           (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
00431                           &(p->regs), ONIG_OPTION_NONE);
00432     }
00433     if (!tmpreg) RREGEXP(regex)->usecnt--;
00434     if (tmpreg) {
00435         if (RREGEXP(regex)->usecnt) {
00436             onig_free(re);
00437         }
00438         else {
00439             onig_free(RREGEXP(regex)->ptr);
00440             RREGEXP(regex)->ptr = re;
00441         }
00442     }
00443 
00444     if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
00445     if (ret < 0) {
00446         /* not matched */
00447         return Qnil;
00448     }
00449 
00450     MATCHED(p);
00451     p->prev = p->curr;
00452     if (succptr) {
00453         p->curr += p->regs.end[0];
00454     }
00455     if (getstr) {
00456         return extract_beg_len(p, p->prev, p->regs.end[0]);
00457     }
00458     else {
00459         return INT2FIX(p->regs.end[0]);
00460     }
00461 }
00462 
00463 /*
00464  * call-seq: scan(pattern) => String
00465  *
00466  * Tries to match with +pattern+ at the current position. If there's a match,
00467  * the scanner advances the "scan pointer" and returns the matched string.
00468  * Otherwise, the scanner returns +nil+.
00469  *
00470  *   s = StringScanner.new('test string')
00471  *   p s.scan(/\w+/)   # -> "test"
00472  *   p s.scan(/\w+/)   # -> nil
00473  *   p s.scan(/\s+/)   # -> " "
00474  *   p s.scan(/\w+/)   # -> "string"
00475  *   p s.scan(/./)     # -> nil
00476  *
00477  */
00478 static VALUE
00479 strscan_scan(VALUE self, VALUE re)
00480 {
00481     return strscan_do_scan(self, re, 1, 1, 1);
00482 }
00483 
00484 /*
00485  * call-seq: match?(pattern)
00486  *
00487  * Tests whether the given +pattern+ is matched from the current scan pointer.
00488  * Returns the length of the match, or +nil+.  The scan pointer is not advanced.
00489  *
00490  *   s = StringScanner.new('test string')
00491  *   p s.match?(/\w+/)   # -> 4
00492  *   p s.match?(/\w+/)   # -> 4
00493  *   p s.match?(/\s+/)   # -> nil
00494  */
00495 static VALUE
00496 strscan_match_p(VALUE self, VALUE re)
00497 {
00498     return strscan_do_scan(self, re, 0, 0, 1);
00499 }
00500 
00501 /*
00502  * call-seq: skip(pattern)
00503  *
00504  * Attempts to skip over the given +pattern+ beginning with the scan pointer.
00505  * If it matches, the scan pointer is advanced to the end of the match, and the
00506  * length of the match is returned.  Otherwise, +nil+ is returned.
00507  *
00508  * It's similar to #scan, but without returning the matched string.
00509  *
00510  *   s = StringScanner.new('test string')
00511  *   p s.skip(/\w+/)   # -> 4
00512  *   p s.skip(/\w+/)   # -> nil
00513  *   p s.skip(/\s+/)   # -> 1
00514  *   p s.skip(/\w+/)   # -> 6
00515  *   p s.skip(/./)     # -> nil
00516  *
00517  */
00518 static VALUE
00519 strscan_skip(VALUE self, VALUE re)
00520 {
00521     return strscan_do_scan(self, re, 1, 0, 1);
00522 }
00523 
00524 /*
00525  * call-seq: check(pattern)
00526  *
00527  * This returns the value that #scan would return, without advancing the scan
00528  * pointer.  The match register is affected, though.
00529  *
00530  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00531  *   s.check /Fri/               # -> "Fri"
00532  *   s.pos                       # -> 0
00533  *   s.matched                   # -> "Fri"
00534  *   s.check /12/                # -> nil
00535  *   s.matched                   # -> nil
00536  *
00537  * Mnemonic: it "checks" to see whether a #scan will return a value.
00538  */
00539 static VALUE
00540 strscan_check(VALUE self, VALUE re)
00541 {
00542     return strscan_do_scan(self, re, 0, 1, 1);
00543 }
00544 
00545 /*
00546  * call-seq: scan_full(pattern, advance_pointer_p, return_string_p)
00547  *
00548  * Tests whether the given +pattern+ is matched from the current scan pointer.
00549  * Advances the scan pointer if +advance_pointer_p+ is true.
00550  * Returns the matched string if +return_string_p+ is true.
00551  * The match register is affected.
00552  *
00553  * "full" means "#scan with full parameters".
00554  */
00555 static VALUE
00556 strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f)
00557 {
00558     return strscan_do_scan(self, re, RTEST(s), RTEST(f), 1);
00559 }
00560 
00561 /*
00562  * call-seq: scan_until(pattern)
00563  *
00564  * Scans the string _until_ the +pattern+ is matched.  Returns the substring up
00565  * to and including the end of the match, advancing the scan pointer to that
00566  * location. If there is no match, +nil+ is returned.
00567  *
00568  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00569  *   s.scan_until(/1/)        # -> "Fri Dec 1"
00570  *   s.pre_match              # -> "Fri Dec "
00571  *   s.scan_until(/XYZ/)      # -> nil
00572  */
00573 static VALUE
00574 strscan_scan_until(VALUE self, VALUE re)
00575 {
00576     return strscan_do_scan(self, re, 1, 1, 0);
00577 }
00578 
00579 /*
00580  * call-seq: exist?(pattern)
00581  *
00582  * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string,
00583  * without advancing the scan pointer.  This predicates whether a #scan_until
00584  * will return a value.
00585  *
00586  *   s = StringScanner.new('test string')
00587  *   s.exist? /s/            # -> 3
00588  *   s.scan /test/           # -> "test"
00589  *   s.exist? /s/            # -> 2
00590  *   s.exist? /e/            # -> nil
00591  */
00592 static VALUE
00593 strscan_exist_p(VALUE self, VALUE re)
00594 {
00595     return strscan_do_scan(self, re, 0, 0, 0);
00596 }
00597 
00598 /*
00599  * call-seq: skip_until(pattern)
00600  *
00601  * Advances the scan pointer until +pattern+ is matched and consumed.  Returns
00602  * the number of bytes advanced, or +nil+ if no match was found.
00603  *
00604  * Look ahead to match +pattern+, and advance the scan pointer to the _end_
00605  * of the match.  Return the number of bytes advanced, or +nil+ if the
00606  * match was unsuccessful.
00607  *
00608  * It's similar to #scan_until, but without returning the intervening string.
00609  *
00610  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00611  *   s.skip_until /12/           # -> 10
00612  *   s                           #
00613  */
00614 static VALUE
00615 strscan_skip_until(VALUE self, VALUE re)
00616 {
00617     return strscan_do_scan(self, re, 1, 0, 0);
00618 }
00619 
00620 /*
00621  * call-seq: check_until(pattern)
00622  *
00623  * This returns the value that #scan_until would return, without advancing the
00624  * scan pointer.  The match register is affected, though.
00625  *
00626  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00627  *   s.check_until /12/          # -> "Fri Dec 12"
00628  *   s.pos                       # -> 0
00629  *   s.matched                   # -> "12"
00630  *
00631  * Mnemonic: it "checks" to see whether a #scan_until will return a value.
00632  */
00633 static VALUE
00634 strscan_check_until(VALUE self, VALUE re)
00635 {
00636     return strscan_do_scan(self, re, 0, 1, 0);
00637 }
00638 
00639 /*
00640  * call-seq: search_full(pattern, advance_pointer_p, return_string_p)
00641  *
00642  * Scans the string _until_ the +pattern+ is matched.
00643  * Advances the scan pointer if +advance_pointer_p+, otherwise not.
00644  * Returns the matched string if +return_string_p+ is true, otherwise
00645  * returns the number of bytes advanced.
00646  * This method does affect the match register.
00647  */
00648 static VALUE
00649 strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f)
00650 {
00651     return strscan_do_scan(self, re, RTEST(s), RTEST(f), 0);
00652 }
00653 
00654 static void
00655 adjust_registers_to_matched(struct strscanner *p)
00656 {
00657     onig_region_clear(&(p->regs));
00658     onig_region_set(&(p->regs), 0, 0, p->curr - p->prev);
00659 }
00660 
00661 /*
00662  * Scans one character and returns it.
00663  * This method is multibyte character sensitive.
00664  *
00665  *   s = StringScanner.new("ab")
00666  *   s.getch           # => "a"
00667  *   s.getch           # => "b"
00668  *   s.getch           # => nil
00669  *
00670  *   $KCODE = 'EUC'
00671  *   s = StringScanner.new("\244\242")
00672  *   s.getch           # => "\244\242"   # Japanese hira-kana "A" in EUC-JP
00673  *   s.getch           # => nil
00674  */
00675 static VALUE
00676 strscan_getch(VALUE self)
00677 {
00678     struct strscanner *p;
00679     long len;
00680 
00681     GET_SCANNER(self, p);
00682     CLEAR_MATCH_STATUS(p);
00683     if (EOS_P(p))
00684         return Qnil;
00685 
00686     len = rb_enc_mbclen(CURPTR(p), S_PEND(p), rb_enc_get(p->str));
00687     if (p->curr + len > S_LEN(p)) {
00688         len = S_LEN(p) - p->curr;
00689     }
00690     p->prev = p->curr;
00691     p->curr += len;
00692     MATCHED(p);
00693     adjust_registers_to_matched(p);
00694     return extract_range(p, p->prev + p->regs.beg[0],
00695                             p->prev + p->regs.end[0]);
00696 }
00697 
00698 /*
00699  * Scans one byte and returns it.
00700  * This method is not multibyte character sensitive.
00701  * See also: #getch.
00702  *
00703  *   s = StringScanner.new('ab')
00704  *   s.get_byte         # => "a"
00705  *   s.get_byte         # => "b"
00706  *   s.get_byte         # => nil
00707  *
00708  *   $KCODE = 'EUC'
00709  *   s = StringScanner.new("\244\242")
00710  *   s.get_byte         # => "\244"
00711  *   s.get_byte         # => "\242"
00712  *   s.get_byte         # => nil
00713  */
00714 static VALUE
00715 strscan_get_byte(VALUE self)
00716 {
00717     struct strscanner *p;
00718 
00719     GET_SCANNER(self, p);
00720     CLEAR_MATCH_STATUS(p);
00721     if (EOS_P(p))
00722         return Qnil;
00723 
00724     p->prev = p->curr;
00725     p->curr++;
00726     MATCHED(p);
00727     adjust_registers_to_matched(p);
00728     return extract_range(p, p->prev + p->regs.beg[0],
00729                             p->prev + p->regs.end[0]);
00730 }
00731 
00732 /*
00733  * Equivalent to #get_byte.
00734  * This method is obsolete; use #get_byte instead.
00735  */
00736 static VALUE
00737 strscan_getbyte(VALUE self)
00738 {
00739     rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead");
00740     return strscan_get_byte(self);
00741 }
00742 
00743 /*
00744  * call-seq: peek(len)
00745  *
00746  * Extracts a string corresponding to <tt>string[pos,len]</tt>, without
00747  * advancing the scan pointer.
00748  *
00749  *   s = StringScanner.new('test string')
00750  *   s.peek(7)          # => "test st"
00751  *   s.peek(7)          # => "test st"
00752  *
00753  */
00754 static VALUE
00755 strscan_peek(VALUE self, VALUE vlen)
00756 {
00757     struct strscanner *p;
00758     long len;
00759 
00760     GET_SCANNER(self, p);
00761 
00762     len = NUM2LONG(vlen);
00763     if (EOS_P(p))
00764         return infect(str_new(p, "", 0), p);
00765 
00766     if (p->curr + len > S_LEN(p))
00767         len = S_LEN(p) - p->curr;
00768     return extract_beg_len(p, p->curr, len);
00769 }
00770 
00771 /*
00772  * Equivalent to #peek.
00773  * This method is obsolete; use #peek instead.
00774  */
00775 static VALUE
00776 strscan_peep(VALUE self, VALUE vlen)
00777 {
00778     rb_warning("StringScanner#peep is obsolete; use #peek instead");
00779     return strscan_peek(self, vlen);
00780 }
00781 
00782 /*
00783  * Set the scan pointer to the previous position.  Only one previous position is
00784  * remembered, and it changes with each scanning operation.
00785  *
00786  *   s = StringScanner.new('test string')
00787  *   s.scan(/\w+/)        # => "test"
00788  *   s.unscan
00789  *   s.scan(/../)         # => "te"
00790  *   s.scan(/\d/)         # => nil
00791  *   s.unscan             # ScanError: unscan failed: previous match record not exist
00792  */
00793 static VALUE
00794 strscan_unscan(VALUE self)
00795 {
00796     struct strscanner *p;
00797 
00798     GET_SCANNER(self, p);
00799     if (! MATCHED_P(p))
00800         rb_raise(ScanError, "unscan failed: previous match record not exist");
00801     p->curr = p->prev;
00802     CLEAR_MATCH_STATUS(p);
00803     return self;
00804 }
00805 
00806 /*
00807  * Returns +true+ iff the scan pointer is at the beginning of the line.
00808  *
00809  *   s = StringScanner.new("test\ntest\n")
00810  *   s.bol?           # => true
00811  *   s.scan(/te/)
00812  *   s.bol?           # => false
00813  *   s.scan(/st\n/)
00814  *   s.bol?           # => true
00815  *   s.terminate
00816  *   s.bol?           # => true
00817  */
00818 static VALUE
00819 strscan_bol_p(VALUE self)
00820 {
00821     struct strscanner *p;
00822 
00823     GET_SCANNER(self, p);
00824     if (CURPTR(p) > S_PEND(p)) return Qnil;
00825     if (p->curr == 0) return Qtrue;
00826     return (*(CURPTR(p) - 1) == '\n') ? Qtrue : Qfalse;
00827 }
00828 
00829 /*
00830  * Returns +true+ if the scan pointer is at the end of the string.
00831  *
00832  *   s = StringScanner.new('test string')
00833  *   p s.eos?          # => false
00834  *   s.scan(/test/)
00835  *   p s.eos?          # => false
00836  *   s.terminate
00837  *   p s.eos?          # => true
00838  */
00839 static VALUE
00840 strscan_eos_p(VALUE self)
00841 {
00842     struct strscanner *p;
00843 
00844     GET_SCANNER(self, p);
00845     return EOS_P(p) ? Qtrue : Qfalse;
00846 }
00847 
00848 /*
00849  * Equivalent to #eos?.
00850  * This method is obsolete, use #eos? instead.
00851  */
00852 static VALUE
00853 strscan_empty_p(VALUE self)
00854 {
00855     rb_warning("StringScanner#empty? is obsolete; use #eos? instead");
00856     return strscan_eos_p(self);
00857 }
00858 
00859 /*
00860  * Returns true iff there is more data in the string.  See #eos?.
00861  * This method is obsolete; use #eos? instead.
00862  *
00863  *   s = StringScanner.new('test string')
00864  *   s.eos?              # These two
00865  *   s.rest?             # are opposites.
00866  */
00867 static VALUE
00868 strscan_rest_p(VALUE self)
00869 {
00870     struct strscanner *p;
00871 
00872     GET_SCANNER(self, p);
00873     return EOS_P(p) ? Qfalse : Qtrue;
00874 }
00875 
00876 /*
00877  * Returns +true+ iff the last match was successful.
00878  *
00879  *   s = StringScanner.new('test string')
00880  *   s.match?(/\w+/)     # => 4
00881  *   s.matched?          # => true
00882  *   s.match?(/\d+/)     # => nil
00883  *   s.matched?          # => false
00884  */
00885 static VALUE
00886 strscan_matched_p(VALUE self)
00887 {
00888     struct strscanner *p;
00889 
00890     GET_SCANNER(self, p);
00891     return MATCHED_P(p) ? Qtrue : Qfalse;
00892 }
00893 
00894 /*
00895  * Returns the last matched string.
00896  *
00897  *   s = StringScanner.new('test string')
00898  *   s.match?(/\w+/)     # -> 4
00899  *   s.matched           # -> "test"
00900  */
00901 static VALUE
00902 strscan_matched(VALUE self)
00903 {
00904     struct strscanner *p;
00905 
00906     GET_SCANNER(self, p);
00907     if (! MATCHED_P(p)) return Qnil;
00908     return extract_range(p, p->prev + p->regs.beg[0],
00909                             p->prev + p->regs.end[0]);
00910 }
00911 
00912 /*
00913  * Returns the size of the most recent match (see #matched), or +nil+ if there
00914  * was no recent match.
00915  *
00916  *   s = StringScanner.new('test string')
00917  *   s.check /\w+/           # -> "test"
00918  *   s.matched_size          # -> 4
00919  *   s.check /\d+/           # -> nil
00920  *   s.matched_size          # -> nil
00921  */
00922 static VALUE
00923 strscan_matched_size(VALUE self)
00924 {
00925     struct strscanner *p;
00926 
00927     GET_SCANNER(self, p);
00928     if (! MATCHED_P(p)) return Qnil;
00929     return INT2NUM(p->regs.end[0] - p->regs.beg[0]);
00930 }
00931 
00932 /*
00933  * call-seq: [](n)
00934  *
00935  * Return the n-th subgroup in the most recent match.
00936  *
00937  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00938  *   s.scan(/(\w+) (\w+) (\d+) /)       # -> "Fri Dec 12 "
00939  *   s[0]                               # -> "Fri Dec 12 "
00940  *   s[1]                               # -> "Fri"
00941  *   s[2]                               # -> "Dec"
00942  *   s[3]                               # -> "12"
00943  *   s.post_match                       # -> "1975 14:39"
00944  *   s.pre_match                        # -> ""
00945  */
00946 static VALUE
00947 strscan_aref(VALUE self, VALUE idx)
00948 {
00949     struct strscanner *p;
00950     long i;
00951 
00952     GET_SCANNER(self, p);
00953     if (! MATCHED_P(p))        return Qnil;
00954 
00955     i = NUM2LONG(idx);
00956     if (i < 0)
00957         i += p->regs.num_regs;
00958     if (i < 0)                 return Qnil;
00959     if (i >= p->regs.num_regs) return Qnil;
00960     if (p->regs.beg[i] == -1)  return Qnil;
00961 
00962     return extract_range(p, p->prev + p->regs.beg[i],
00963                             p->prev + p->regs.end[i]);
00964 }
00965 
00966 /*
00967  * Return the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan.
00968  *
00969  *   s = StringScanner.new('test string')
00970  *   s.scan(/\w+/)           # -> "test"
00971  *   s.scan(/\s+/)           # -> " "
00972  *   s.pre_match             # -> "test"
00973  *   s.post_match            # -> "string"
00974  */
00975 static VALUE
00976 strscan_pre_match(VALUE self)
00977 {
00978     struct strscanner *p;
00979 
00980     GET_SCANNER(self, p);
00981     if (! MATCHED_P(p)) return Qnil;
00982     return extract_range(p, 0, p->prev + p->regs.beg[0]);
00983 }
00984 
00985 /*
00986  * Return the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan.
00987  *
00988  *   s = StringScanner.new('test string')
00989  *   s.scan(/\w+/)           # -> "test"
00990  *   s.scan(/\s+/)           # -> " "
00991  *   s.pre_match             # -> "test"
00992  *   s.post_match            # -> "string"
00993  */
00994 static VALUE
00995 strscan_post_match(VALUE self)
00996 {
00997     struct strscanner *p;
00998 
00999     GET_SCANNER(self, p);
01000     if (! MATCHED_P(p)) return Qnil;
01001     return extract_range(p, p->prev + p->regs.end[0], S_LEN(p));
01002 }
01003 
01004 /*
01005  * Returns the "rest" of the string (i.e. everything after the scan pointer).
01006  * If there is no more data (eos? = true), it returns <tt>""</tt>.
01007  */
01008 static VALUE
01009 strscan_rest(VALUE self)
01010 {
01011     struct strscanner *p;
01012 
01013     GET_SCANNER(self, p);
01014     if (EOS_P(p)) {
01015         return infect(str_new(p, "", 0), p);
01016     }
01017     return extract_range(p, p->curr, S_LEN(p));
01018 }
01019 
01020 /*
01021  * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>.
01022  */
01023 static VALUE
01024 strscan_rest_size(VALUE self)
01025 {
01026     struct strscanner *p;
01027     long i;
01028 
01029     GET_SCANNER(self, p);
01030     if (EOS_P(p)) {
01031         return INT2FIX(0);
01032     }
01033     i = S_LEN(p) - p->curr;
01034     return INT2FIX(i);
01035 }
01036 
01037 /*
01038  * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>.
01039  * This method is obsolete; use #rest_size instead.
01040  */
01041 static VALUE
01042 strscan_restsize(VALUE self)
01043 {
01044     rb_warning("StringScanner#restsize is obsolete; use #rest_size instead");
01045     return strscan_rest_size(self);
01046 }
01047 
01048 #define INSPECT_LENGTH 5
01049 #define BUFSIZE 256
01050 
01051 /*
01052  * Returns a string that represents the StringScanner object, showing:
01053  * - the current position
01054  * - the size of the string
01055  * - the characters surrounding the scan pointer
01056  *
01057  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
01058  *   s.inspect            # -> '#<StringScanner 0/21 @ "Fri D...">'
01059  *   s.scan_until /12/    # -> "Fri Dec 12"
01060  *   s.inspect            # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">'
01061  */
01062 static VALUE
01063 strscan_inspect(VALUE self)
01064 {
01065     struct strscanner *p;
01066     char buf[BUFSIZE];
01067     long len;
01068     VALUE a, b;
01069 
01070     Data_Get_Struct(self, struct strscanner, p);
01071     if (NIL_P(p->str)) {
01072         len = snprintf(buf, BUFSIZE, "#<%s (uninitialized)>",
01073                        rb_class2name(CLASS_OF(self)));
01074         return infect(rb_str_new(buf, len), p);
01075     }
01076     if (EOS_P(p)) {
01077         len = snprintf(buf, BUFSIZE, "#<%s fin>",
01078                        rb_class2name(CLASS_OF(self)));
01079         return infect(rb_str_new(buf, len), p);
01080     }
01081     if (p->curr == 0) {
01082         b = inspect2(p);
01083         len = snprintf(buf, BUFSIZE, "#<%s %ld/%ld @ %s>",
01084                        rb_class2name(CLASS_OF(self)),
01085                        p->curr, S_LEN(p),
01086                        RSTRING_PTR(b));
01087         return infect(rb_str_new(buf, len), p);
01088     }
01089     a = inspect1(p);
01090     b = inspect2(p);
01091     len = snprintf(buf, BUFSIZE, "#<%s %ld/%ld %s @ %s>",
01092                    rb_class2name(CLASS_OF(self)),
01093                    p->curr, S_LEN(p),
01094                    RSTRING_PTR(a),
01095                    RSTRING_PTR(b));
01096     return infect(rb_str_new(buf, len), p);
01097 }
01098 
01099 static VALUE
01100 inspect1(struct strscanner *p)
01101 {
01102     char buf[BUFSIZE];
01103     char *bp = buf;
01104     long len;
01105 
01106     if (p->curr == 0) return rb_str_new2("");
01107     if (p->curr > INSPECT_LENGTH) {
01108         strcpy(bp, "..."); bp += 3;
01109         len = INSPECT_LENGTH;
01110     }
01111     else {
01112         len = p->curr;
01113     }
01114     memcpy(bp, CURPTR(p) - len, len); bp += len;
01115     return rb_str_dump(rb_str_new(buf, bp - buf));
01116 }
01117 
01118 static VALUE
01119 inspect2(struct strscanner *p)
01120 {
01121     char buf[BUFSIZE];
01122     char *bp = buf;
01123     long len;
01124 
01125     if (EOS_P(p)) return rb_str_new2("");
01126     len = S_LEN(p) - p->curr;
01127     if (len > INSPECT_LENGTH) {
01128         len = INSPECT_LENGTH;
01129         memcpy(bp, CURPTR(p), len); bp += len;
01130         strcpy(bp, "..."); bp += 3;
01131     }
01132     else {
01133         memcpy(bp, CURPTR(p), len); bp += len;
01134     }
01135     return rb_str_dump(rb_str_new(buf, bp - buf));
01136 }
01137 
01138 /* =======================================================================
01139                               Ruby Interface
01140    ======================================================================= */
01141 
01142 /*
01143  * Document-class: StringScanner
01144  *
01145  * StringScanner provides for lexical scanning operations on a String.  Here is
01146  * an example of its usage:
01147  *
01148  *   s = StringScanner.new('This is an example string')
01149  *   s.eos?               # -> false
01150  *
01151  *   p s.scan(/\w+/)      # -> "This"
01152  *   p s.scan(/\w+/)      # -> nil
01153  *   p s.scan(/\s+/)      # -> " "
01154  *   p s.scan(/\s+/)      # -> nil
01155  *   p s.scan(/\w+/)      # -> "is"
01156  *   s.eos?               # -> false
01157  *
01158  *   p s.scan(/\s+/)      # -> " "
01159  *   p s.scan(/\w+/)      # -> "an"
01160  *   p s.scan(/\s+/)      # -> " "
01161  *   p s.scan(/\w+/)      # -> "example"
01162  *   p s.scan(/\s+/)      # -> " "
01163  *   p s.scan(/\w+/)      # -> "string"
01164  *   s.eos?               # -> true
01165  *
01166  *   p s.scan(/\s+/)      # -> nil
01167  *   p s.scan(/\w+/)      # -> nil
01168  *
01169  * Scanning a string means remembering the position of a <i>scan pointer</i>,
01170  * which is just an index.  The point of scanning is to move forward a bit at
01171  * a time, so matches are sought after the scan pointer; usually immediately
01172  * after it.
01173  *
01174  * Given the string "test string", here are the pertinent scan pointer
01175  * positions:
01176  *
01177  *     t e s t   s t r i n g
01178  *   0 1 2 ...             1
01179  *                         0
01180  *
01181  * When you #scan for a pattern (a regular expression), the match must occur
01182  * at the character after the scan pointer.  If you use #scan_until, then the
01183  * match can occur anywhere after the scan pointer.  In both cases, the scan
01184  * pointer moves <i>just beyond</i> the last character of the match, ready to
01185  * scan again from the next character onwards.  This is demonstrated by the
01186  * example above.
01187  *
01188  * == Method Categories
01189  *
01190  * There are other methods besides the plain scanners.  You can look ahead in
01191  * the string without actually scanning.  You can access the most recent match.
01192  * You can modify the string being scanned, reset or terminate the scanner,
01193  * find out or change the position of the scan pointer, skip ahead, and so on.
01194  *
01195  * === Advancing the Scan Pointer
01196  *
01197  * - #getch
01198  * - #get_byte
01199  * - #scan
01200  * - #scan_until
01201  * - #skip
01202  * - #skip_until
01203  *
01204  * === Looking Ahead
01205  *
01206  * - #check
01207  * - #check_until
01208  * - #exist?
01209  * - #match?
01210  * - #peek
01211  *
01212  * === Finding Where we Are
01213  *
01214  * - #beginning_of_line? (#bol?)
01215  * - #eos?
01216  * - #rest?
01217  * - #rest_size
01218  * - #pos
01219  *
01220  * === Setting Where we Are
01221  *
01222  * - #reset
01223  * - #terminate
01224  * - #pos=
01225  *
01226  * === Match Data
01227  *
01228  * - #matched
01229  * - #matched?
01230  * - #matched_size
01231  * - []
01232  * - #pre_match
01233  * - #post_match
01234  *
01235  * === Miscellaneous
01236  *
01237  * - <<
01238  * - #concat
01239  * - #string
01240  * - #string=
01241  * - #unscan
01242  *
01243  * There are aliases to several of the methods.
01244  */
01245 void
01246 Init_strscan()
01247 {
01248     ID id_scanerr = rb_intern("ScanError");
01249     VALUE tmp;
01250 
01251     StringScanner = rb_define_class("StringScanner", rb_cObject);
01252     ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError);
01253     if (!rb_const_defined(rb_cObject, id_scanerr)) {
01254         rb_const_set(rb_cObject, id_scanerr, ScanError);
01255     }
01256     tmp = rb_str_new2(STRSCAN_VERSION);
01257     rb_obj_freeze(tmp);
01258     rb_const_set(StringScanner, rb_intern("Version"), tmp);
01259     tmp = rb_str_new2("$Id: strscan.c 27437 2010-04-22 08:04:13Z nobu $");
01260     rb_obj_freeze(tmp);
01261     rb_const_set(StringScanner, rb_intern("Id"), tmp);
01262 
01263     rb_define_alloc_func(StringScanner, strscan_s_allocate);
01264     rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1);
01265     rb_define_private_method(StringScanner, "initialize_copy", strscan_init_copy, 1);
01266     rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0);
01267     rb_define_method(StringScanner, "reset",       strscan_reset,       0);
01268     rb_define_method(StringScanner, "terminate",   strscan_terminate,   0);
01269     rb_define_method(StringScanner, "clear",       strscan_clear,       0);
01270     rb_define_method(StringScanner, "string",      strscan_get_string,  0);
01271     rb_define_method(StringScanner, "string=",     strscan_set_string,  1);
01272     rb_define_method(StringScanner, "concat",      strscan_concat,      1);
01273     rb_define_method(StringScanner, "<<",          strscan_concat,      1);
01274     rb_define_method(StringScanner, "pos",         strscan_get_pos,     0);
01275     rb_define_method(StringScanner, "pos=",        strscan_set_pos,     1);
01276     rb_define_method(StringScanner, "pointer",     strscan_get_pos,     0);
01277     rb_define_method(StringScanner, "pointer=",    strscan_set_pos,     1);
01278 
01279     rb_define_method(StringScanner, "scan",        strscan_scan,        1);
01280     rb_define_method(StringScanner, "skip",        strscan_skip,        1);
01281     rb_define_method(StringScanner, "match?",      strscan_match_p,     1);
01282     rb_define_method(StringScanner, "check",       strscan_check,       1);
01283     rb_define_method(StringScanner, "scan_full",   strscan_scan_full,   3);
01284 
01285     rb_define_method(StringScanner, "scan_until",  strscan_scan_until,  1);
01286     rb_define_method(StringScanner, "skip_until",  strscan_skip_until,  1);
01287     rb_define_method(StringScanner, "exist?",      strscan_exist_p,     1);
01288     rb_define_method(StringScanner, "check_until", strscan_check_until, 1);
01289     rb_define_method(StringScanner, "search_full", strscan_search_full, 3);
01290 
01291     rb_define_method(StringScanner, "getch",       strscan_getch,       0);
01292     rb_define_method(StringScanner, "get_byte",    strscan_get_byte,    0);
01293     rb_define_method(StringScanner, "getbyte",     strscan_getbyte,     0);
01294     rb_define_method(StringScanner, "peek",        strscan_peek,        1);
01295     rb_define_method(StringScanner, "peep",        strscan_peep,        1);
01296 
01297     rb_define_method(StringScanner, "unscan",      strscan_unscan,      0);
01298 
01299     rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);
01300     rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?"));
01301     rb_define_method(StringScanner, "eos?",        strscan_eos_p,       0);
01302     rb_define_method(StringScanner, "empty?",      strscan_empty_p,     0);
01303     rb_define_method(StringScanner, "rest?",       strscan_rest_p,      0);
01304 
01305     rb_define_method(StringScanner, "matched?",    strscan_matched_p,   0);
01306     rb_define_method(StringScanner, "matched",     strscan_matched,     0);
01307     rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0);
01308     rb_define_method(StringScanner, "[]",          strscan_aref,        1);
01309     rb_define_method(StringScanner, "pre_match",   strscan_pre_match,   0);
01310     rb_define_method(StringScanner, "post_match",  strscan_post_match,  0);
01311 
01312     rb_define_method(StringScanner, "rest",        strscan_rest,        0);
01313     rb_define_method(StringScanner, "rest_size",   strscan_rest_size,   0);
01314     rb_define_method(StringScanner, "restsize",    strscan_restsize,    0);
01315 
01316     rb_define_method(StringScanner, "inspect",     strscan_inspect,     0);
01317 }
01318 

Generated on Sat Jul 7 2012 15:29:15 for Ruby by  doxygen 1.7.1