WvStreams
wvtclstring.cc
00001 /*
00002  * Worldvisions Weaver Software:
00003  *   Copyright (C) 1997-2002 Net Integration Technologies, Inc.
00004  */
00005 #include "wvbackslash.h"
00006 #include "wvbuf.h"
00007 #include "wvstream.h"
00008 #include "wvstring.h"
00009 #include "wvstringmask.h"
00010 #include "wvtclstring.h"
00011 #include <climits>
00012 
// Shared character masks, each built from the matching *_STR macro (the
// macros themselves are defined outside this file).
// NOTE(review): presumably these serve as the default `nasties` and
// `splitchars` arguments of the wvtcl_* functions -- confirm against
// wvtclstring.h.
00013 const WvStringMask WVTCL_NASTY_SPACES(WVTCL_NASTY_SPACES_STR);
00014 const WvStringMask WVTCL_NASTY_NEWLINES(WVTCL_NASTY_NEWLINES_STR);
00015 const WvStringMask WVTCL_SPLITCHARS(WVTCL_SPLITCHARS_STR);
00016 
00017 static size_t wvtcl_escape(char *dst, const char *s, size_t s_len,
00018                            const WvStringMask &nasties, bool *verbatim = NULL)
00019 {
00020     if (verbatim) *verbatim = false;
00021 
00022     // NULL strings remain such
00023     if (s == NULL)
00024         return 0;
00025     // empty strings are just {}
00026     if (s_len == 0)
00027     {
00028         if (dst)
00029         {
00030             dst[0] = '{';
00031             dst[1] = '}';
00032         }
00033         return 2;
00034     }
00035     
00036     bool backslashify = false, inescape = false;
00037     int len = 0, unprintables = 0, bracecount = 0;
00038     const char *cptr, *cptr_end = s + s_len;
00039     
00040     // figure out which method we need to use: backslashify or embrace.
00041     // also count the number of unprintable characters we'll need to 
00042     // backslashify, if it turns out that's necessary.
00043     for (cptr = s; cptr != cptr_end; cptr++)
00044     {
00045         // Assume we do nothing
00046         if (dst) dst[len] = *cptr;
00047         ++len;
00048 
00049         if (!inescape && *cptr == '{')
00050             bracecount++;
00051         else if (!inescape && *cptr == '}')
00052             bracecount--;
00053         if (bracecount < 0)
00054             backslashify = true;
00055 
00056         bool doit = false;
00057         switch (*cptr)
00058         {
00059         case WVTCL_ALWAYS_NASTY_CASE:
00060             doit = true;
00061             break;
00062         default:
00063             if (nasties[*cptr])
00064                 doit = true;
00065         }
00066         if (doit)
00067             unprintables++;
00068 
00069         if (*cptr == '\\')
00070             inescape = !inescape;
00071         else
00072             inescape = false;
00073     }
00074     
00075     // if the braces aren't balanced, backslashify
00076     if (bracecount != 0 || inescape)
00077         backslashify = true;
00078 
00079     if (!backslashify && !unprintables)
00080     {
00081         if (verbatim) *verbatim = true;
00082         return len; // no work needed!
00083     }
00084     
00085     if (backslashify)
00086     {
00087         if (dst)
00088         {
00089             len = 0;
00090             for (cptr = s; cptr != cptr_end; ++cptr)
00091             {
00092                 bool doit = false;
00093                 switch (*cptr)
00094                 {
00095                 case WVTCL_ALWAYS_NASTY_CASE:
00096                     doit = true;
00097                     break;
00098                 default:
00099                     if (nasties[*cptr])
00100                         doit = true;
00101                 }
00102                 if (doit)
00103                     dst[len++] = '\\';
00104 
00105                 dst[len++] = *cptr;
00106             }
00107             return len;
00108         }
00109         else return len+unprintables;
00110     }
00111     else
00112     {
00113         // the embrace method: just take the string and put braces around it
00114         if (dst)
00115         {
00116             len = 0;
00117             dst[len++] = '{';
00118             for (cptr = s; cptr != cptr_end; ++cptr)
00119                 dst[len++] = *cptr;
00120             dst[len++] = '}';
00121             return len;
00122         }
00123         else return len+2;
00124     }
00125 }
00126 
00127 
00128 WvString wvtcl_escape(WvStringParm s, const WvStringMask &nasties)
00129 {
00130     size_t s_len = s.len();
00131 
00132     bool verbatim;
00133     size_t len = wvtcl_escape(NULL, s, s_len, nasties, &verbatim);
00134     if (verbatim) return s;
00135 
00136     WvString result;
00137     result.setsize(len);
00138     char *e = result.edit();
00139     e += wvtcl_escape(e, s, s_len, nasties);
00140     *e = '\0';
00141     return result;
00142 }
00143 
00144 
// Reverses wvtcl_escape() for one word of s_len bytes starting at `s`.
//
//   dst      - output buffer, or NULL to only compute the decoded size.
//              No terminating NUL is ever written.
//   s, s_len - the encoded word; need not be NUL-terminated.
//   verbatim - if non-NULL, set to true only when `s` is NULL (there is
//              nothing to decode), false otherwise.
//
// Returns the number of decoded bytes.
//
// A word wrapped in {braces} is returned with the braces removed and its
// contents untouched.  A word wrapped in "quotes" loses the quotes and is
// then unbackslashified; any other word is just unbackslashified.  A
// trailing lone backslash is dropped.
static size_t wvtcl_unescape(char *dst, const char *s, size_t s_len,
        bool *verbatim = NULL)
{
    // NULL strings remain themselves
    if (!s)
    {
        if (verbatim) *verbatim = true;
        return 0;
    }

    if (verbatim) *verbatim = false;

    // an empty word decodes to nothing; bail out before looking at
    // s[s_len-1], which would be out of bounds
    if (s_len == 0)
        return 0;

    // A word needs at least two bytes to have both an opening and a
    // closing delimiter.  Without this check a lone '"' would match as
    // both, leaving start > end below and running off the buffer.
    const bool can_be_delimited = s_len >= 2;

    // deal with embraced strings by simply removing the braces
    if (can_be_delimited && s[0] == '{' && s[s_len-1] == '}')
    {
        if (dst) memcpy(dst, &s[1], s_len-2);
        return s_len - 2;
    }

    // deal with quoted strings by ignoring the quotes _and_ unbackslashifying.
    const char *start = s, *end = &s[s_len];
    if (can_be_delimited && s[0] == '"' && s[s_len-1] == '"')
    {
        ++start;
        --end;
    }

    // otherwise, unbackslashify it: a backslash is dropped and the byte
    // following it (even another backslash) is copied literally.
    size_t len = 0;
    bool inescape = false;
    for (; start != end; ++start)
    {
        if (*start == '\\' && !inescape)
        {
            inescape = true;
            continue;
        }

        inescape = false;
        if (dst) dst[len] = *start;
        len++;
    }
    return len;
}
00202 
00203 
00204 WvString wvtcl_unescape(WvStringParm s)
00205 {
00206     size_t s_len = s.len();
00207 
00208     bool verbatim;
00209     size_t len = wvtcl_unescape(NULL, s, s_len, &verbatim);
00210     if (verbatim) return s;
00211 
00212     WvString result;
00213     result.setsize(len+1);
00214     char *e = result.edit();
00215     e += wvtcl_unescape(e, s, s_len);
00216     *e = '\0';
00217     return result;
00218 }
00219 
00220 
00221 WvString wvtcl_encode(WvList<WvString> &l, const WvStringMask &nasties,
00222                       const WvStringMask &splitchars)
00223 {
00224     int size = 0;
00225 
00226     WvList<WvString>::Iter i(l);
00227     int count = 0;
00228     for (i.rewind(); i.next(); )
00229     {
00230         size += wvtcl_escape(NULL, *i, i->len(), nasties);
00231         ++count;
00232     }
00233     
00234     WvString result;
00235     result.setsize(size+(count-1)+1);
00236 
00237     char *p = result.edit();
00238     int j;
00239     for (i.rewind(), j=0; i.next(); ++j)
00240     {
00241         p += wvtcl_escape(p, *i, i->len(), nasties);
00242         if (j < count - 1)
00243             *p++ = splitchars.first();
00244     }
00245     *p = '\0';
00246     
00247     return result;
00248 }
00249 
// Sentinel returned by the static wvtcl_getword() helper when no complete
// word is available in the input; callers compare the returned length
// against it before using the result.
00250 const size_t WVTCL_GETWORD_NONE (UINT_MAX);
00251 
00252 static size_t wvtcl_getword(char *dst, const char *s, size_t s_len,
00253                             const WvStringMask &splitchars,
00254                             bool do_unescape, size_t *end = NULL)
00255 {
00256     //printf("      used=%d\n", origsize);
00257     if (!s_len) return WVTCL_GETWORD_NONE;
00258 
00259     bool inescape = false, inquote = false, incontinuation = false;
00260     int bracecount = 0;
00261     const char *origend = s + s_len;
00262     const char *sptr, *eptr;
00263 
00264     // skip leading separators
00265     for (sptr = s; sptr != origend; sptr++)
00266     {
00267         if (!splitchars[*sptr])
00268             break;
00269     }
00270 
00271     if (sptr == origend) // nothing left
00272         return WVTCL_GETWORD_NONE;
00273 
00274     // detect initial quote
00275     if (*sptr == '"')
00276     {
00277         inquote = true;
00278         eptr = sptr+1;
00279     }
00280     else
00281         eptr = sptr;
00282     
00283     // loop over string until something satisfactory is found
00284     for (; eptr != origend; eptr++)
00285     {
00286         char ch = *eptr;
00287         
00288         incontinuation = false;
00289         
00290         if (inescape)
00291         {
00292             if (ch == '\n')
00293             {
00294                 // technically we've finished the line-continuation
00295                 // sequence, but we require at least one more character
00296                 // in order to prove that there's a next line somewhere
00297                 // in the buffer.  Otherwise we might stop parsing before
00298                 // we're "really" done if we're given input line-by-line.
00299                 // 
00300                 // A better way to do this would be for getword() to *never*
00301                 // return a string unless it contains a separator character;
00302                 // then we wouldn't need this weird special case.  But it
00303                 // don't work like that; we'll return the last word in the
00304                 // buffer even if it *doesn't* end in a separator character.
00305                 incontinuation = true;
00306             }
00307             inescape = false;
00308         }
00309         else if (ch == '\\')
00310         {
00311             inescape = true;
00312             // now we need a character to complete the escape
00313         }
00314         else // not an escape sequence
00315         {
00316             // detect end of a quoted/unquoted string
00317             if (bracecount == 0)
00318             {
00319                 if (inquote)
00320                 {
00321                     if (ch == '"')
00322                     {
00323                         eptr++;
00324                         break;
00325                     }
00326                 }
00327                 else if (splitchars[ch])
00328                     break;
00329             }
00330             
00331             // match braces
00332             if (!inquote)
00333             {
00334                 if (ch == '{')
00335                     bracecount++;
00336                 else if (bracecount > 0 && ch == '}')
00337                     bracecount--;
00338             }
00339         }
00340     }
00341     
00342     if (bracecount || sptr==eptr || inquote || inescape || incontinuation)
00343         // not there yet...
00344         return WVTCL_GETWORD_NONE;
00345 
00346     //printf("len=%d, unget=%d\n", eptr - sptr, origend - eptr);
00347     if (end) *end = eptr - s;
00348 
00349     if (do_unescape)
00350         return wvtcl_unescape(dst, sptr, eptr-sptr);
00351     else
00352     {
00353         if (dst) memcpy(dst, sptr, eptr-sptr);
00354         return eptr - sptr;
00355     }
00356 }
00357 
00358 
00359 WvString wvtcl_getword(WvBuf &buf, const WvStringMask &splitchars,
00360                        bool do_unescape)
00361 {
00362     int origsize = buf.used();
00363     const char *origptr = (const char *)buf.get(origsize);
00364 
00365     size_t end;
00366     size_t len = wvtcl_getword(NULL, origptr, origsize,
00367             splitchars, do_unescape, &end);
00368     if (len == WVTCL_GETWORD_NONE)
00369     {
00370         buf.unget(origsize);
00371         return WvString::null;
00372     }
00373 
00374     WvString result;
00375     result.setsize(len+1);
00376     char *e = result.edit();
00377     e += wvtcl_getword(e, origptr, origsize, splitchars, do_unescape);
00378     *e = '\0';
00379 
00380     buf.unget(origsize - end);
00381 
00382     return result;
00383 }
00384 
00385 
00386 void wvtcl_decode(WvList<WvString> &l, WvStringParm _s,
00387                   const WvStringMask &splitchars, bool do_unescape)
00388 {
00389     const char *s = _s;
00390     size_t s_len = _s.len();
00391     for (;;)
00392     {
00393         size_t end;
00394         size_t len = wvtcl_getword(NULL, s, s_len,
00395                 splitchars, do_unescape, &end);
00396         if (len == WVTCL_GETWORD_NONE)
00397             break;
00398 
00399         WvString *word = new WvString();
00400         word->setsize(len+1);
00401 
00402         char *e = word->edit();
00403         e += wvtcl_getword(e, s, s_len, splitchars, do_unescape);
00404         *e = '\0';
00405         l.append(word, true);
00406 
00407         s += end;
00408         s_len -= end;
00409     }
00410 }