Leptonica  1.83.1
Image processing and image analysis suite
encoding.c
1 /*====================================================================*
2  - Copyright (C) 2001 Leptonica. All rights reserved.
3  - This software is distributed in the hope that it will be
4  - useful, but with NO WARRANTY OF ANY KIND.
5  - No author or distributor accepts responsibility to anyone for the
6  - consequences of using this software, or for whether it serves any
7  - particular purpose or works at all, unless he or she says so in
8  - writing. Everyone is granted permission to copy, modify and
9  - redistribute this source code, for commercial or non-commercial
10  - purposes, with the following restrictions: (1) the origin of this
11  - source code must not be misrepresented; (2) modified versions must
12  - be plainly marked as such; and (3) this notice may not be removed
13  - or altered from any source or modified source distribution.
14  *====================================================================*/
15 
16 /*
17  * encoding.c
18  *
19  * Base64
20  * char *encodeBase64()
21  * l_uint8 *decodeBase64()
22  * static l_int32 isBase64()
23  * static l_int32 *genReverseTab64()
24  * static void byteConvert3to4()
25  * static void byteConvert4to3()
26  *
27  * Ascii85
28  * char *encodeAscii85()
29  * l_uint8 *decodeAscii85()
30  * static l_int32 convertChunkToAscii85()
31  *
32  * char *encodeAscii85WithComp()
33  * l_uint8 *decodeAscii85WithComp()
34  *
35  * String reformatting for base 64 encoded data
36  * char *reformatPacked64()
37  *
38  * Base64 encoding is useful for encoding binary data in a restricted set of
39  * 64 printable ascii symbols, that includes the 62 alphanumerics and '+'
40  * and '/'. Notably it does not include quotes, so that base64 encoded
41  * strings can be used in situations where quotes are used for formatting.
42  * 64 symbols was chosen because it is the smallest number that can be used
43  * in 4-for-3 byte encoding of binary data:
44  * log2(64) / log2(256) = 0.75 = 3/4
45  *
46  * Ascii85 encoding is used in PostScript and some pdf files for
47  * representing binary data (for example, a compressed image) in printable
48  * ascii symbols. It has a dictionary of 85 symbols; 85 was chosen because
49  * it is the smallest number that can be used in 5-for-4 byte encoding
50  * of binary data (256 possible input values). This can be seen from
51  * the max information content in such a sequence:
52  * log2(84) / log2(256) = 0.799 < 4/5
53  * log2(85) / log2(256) = 0.801 > 4/5
54  */
55 
56 #ifdef HAVE_CONFIG_H
57 #include <config_auto.h>
58 #endif /* HAVE_CONFIG_H */
59 
60 #include <ctype.h>
61 #include <string.h>
62 #include "allheaders.h"
63 
64  /* Base64: encoded line-length limit and the 64-symbol alphabet (indexed by 6-bit value) */
65 static const l_int32 MAX_BASE64_LINE = 72; /* encoder emits '\n' after this many output chars */
66 static const char *tablechar64 =
67  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
68  "abcdefghijklmnopqrstuvwxyz"
69  "0123456789+/";
70 
71 static l_int32 isBase64(char); /* 1 if char is alphanumeric, '+' or '/'; else 0 */
72 static l_int32 *genReverseTab64(void); /* allocates table: symbol --> 6-bit value */
73 static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4); /* 3 bytes --> four 6-bit values */
74 static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3); /* four 6-bit values --> 3 bytes */
75 
76  /* Ascii85 encoding: encoded line-length limit and base-85 place values */
77 static const l_int32 MAX_ASCII85_LINE = 64; /* encoder emits '\n' after this many output chars */
78 static const l_uint32 power85[5] = {1, /* power85[i] == 85^i */
79  85,
80  85 * 85,
81  85 * 85 * 85,
82  85 * 85 * 85 * 85};
83 
    /* Encodes the next chunk of up to 4 input bytes starting at *pindex
     * into outbuf; returns TRUE when that chunk reaches the end of input */
84 static l_int32 convertChunkToAscii85(const l_uint8 *inarray, size_t insize,
85  l_int32 *pindex, char *outbuf,
86  l_int32 *pnbout);
87 
88 /*-------------------------------------------------------------*
89  * Utility for encoding and decoding data with base64 *
90  *-------------------------------------------------------------*/
106 char *
107 encodeBase64(const l_uint8 *inarray,
108  l_int32 insize,
109  l_int32 *poutsize)
110 {
111 char *chara;
112 const l_uint8 *bytea;
113 l_uint8 array3[3], array4[4];
114 l_int32 outsize, i, j, index, linecount;
115 
116  if (!poutsize)
117  return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
118  *poutsize = 0;
119  if (!inarray)
120  return (char *)ERROR_PTR("inarray not defined", __func__, NULL);
121  if (insize <= 0)
122  return (char *)ERROR_PTR("insize not > 0", __func__, NULL);
123 
124  /* The output array is padded to a multiple of 4 bytes, not
125  * counting the newlines. We just need to allocate a large
126  * enough array, and add 4 bytes to make sure it is big enough. */
127  outsize = 4 * ((insize + 2) / 3); /* without newlines */
128  outsize += outsize / MAX_BASE64_LINE + 4; /* with the newlines */
129  if ((chara = (char *)LEPT_CALLOC(outsize, sizeof(char))) == NULL)
130  return (char *)ERROR_PTR("chara not made", __func__, NULL);
131 
132  /* Read all the input data, and convert in sets of 3 input
133  * bytes --> 4 output bytes. */
134  i = index = linecount = 0;
135  bytea = inarray;
136  while (insize--) {
137  if (linecount == MAX_BASE64_LINE) {
138  chara[index++] = '\n';
139  linecount = 0;
140  }
141  array3[i++] = *bytea++;
142  if (i == 3) { /* convert 3 to 4 and save */
143  byteConvert3to4(array3, array4);
144  for (j = 0; j < 4; j++)
145  chara[index++] = tablechar64[array4[j]];
146  i = 0;
147  linecount += 4;
148  }
149  }
150 
151  /* Suppose 1 or 2 bytes has been read but not yet processed.
152  * If 1 byte has been read, this will generate 2 bytes of
153  * output, with 6 bits to the first byte and 2 bits to the second.
154  * We will add two bytes of '=' for padding.
155  * If 2 bytes has been read, this will generate 3 bytes of output,
156  * with 6 bits to the first 2 bytes and 4 bits to the third, and
157  * we add a fourth padding byte ('='). */
158  if (i > 0) { /* left-over 1 or 2 input bytes */
159  for (j = i; j < 3; j++)
160  array3[j] = '\0'; /* zero the remaining input bytes */
161  byteConvert3to4(array3, array4);
162  for (j = 0; j <= i; j++)
163  chara[index++] = tablechar64[array4[j]];
164  for (j = i + 1; j < 4; j++)
165  chara[index++] = '=';
166  }
167  *poutsize = index;
168 
169  return chara;
170 }
171 
172 
192 l_uint8 *
193 decodeBase64(const char *inarray,
194  l_int32 insize,
195  l_int32 *poutsize)
196 {
197 char inchar;
198 l_uint8 *bytea;
199 l_uint8 array3[3], array4[4];
200 l_int32 *rtable64;
201 l_int32 i, j, outsize, in_index, out_index;
202 
203  if (!poutsize)
204  return (l_uint8 *)ERROR_PTR("&outsize not defined", __func__, NULL);
205  *poutsize = 0;
206  if (!inarray)
207  return (l_uint8 *)ERROR_PTR("inarray not defined", __func__, NULL);
208  if (insize <= 0)
209  return (l_uint8 *)ERROR_PTR("insize not > 0", __func__, NULL);
210 
211  /* Validate the input data */
212  for (i = 0; i < insize; i++) {
213  inchar = inarray[i];
214  if (inchar == '\n') continue;
215  if (isBase64(inchar) == 0 && inchar != '=')
216  return (l_uint8 *)ERROR_PTR("invalid char in inarray",
217  __func__, NULL);
218  }
219 
220  /* The input array typically is made with a newline every
221  * MAX_BASE64_LINE input bytes. However, as a printed string, the
222  * newlines would be stripped. So when we allocate the output
223  * array, assume the input array is all data, but strip
224  * out the newlines during decoding. This guarantees that
225  * the allocated array is large enough. */
226  outsize = 3 * ((insize + 3) / 4) + 4;
227  if ((bytea = (l_uint8 *)LEPT_CALLOC(outsize, sizeof(l_uint8))) == NULL)
228  return (l_uint8 *)ERROR_PTR("bytea not made", __func__, NULL);
229 
230  /* The number of encoded input data bytes is always a multiple of 4.
231  * Read all the data, until you reach either the end or
232  * the first pad character '='. The data is processed in
233  * units of 4 input bytes, generating 3 output decoded bytes
234  * of binary data. Newlines are ignored. If there are no
235  * pad bytes, i == 0 at the end of this section. */
236  rtable64 = genReverseTab64();
237  i = in_index = out_index = 0;
238  for (in_index = 0; in_index < insize; in_index++) {
239  inchar = inarray[in_index];
240  if (inchar == '\n') continue;
241  if (inchar == '=') break;
242  array4[i++] = rtable64[(unsigned char)inchar];
243  if (i < 4) {
244  continue;
245  } else { /* i == 4; convert 4 to 3 and save */
246  byteConvert4to3(array4, array3);
247  for (j = 0; j < 3; j++)
248  bytea[out_index++] = array3[j];
249  i = 0;
250  }
251  }
252 
253  /* If i > 0, we ran into pad bytes ('='). If i == 2, there are
254  * two input pad bytes and one output data byte. If i == 3,
255  * there is one input pad byte and two output data bytes. */
256  if (i > 0) {
257  for (j = i; j < 4; j++)
258  array4[j] = '\0'; /* zero the remaining input bytes */
259  byteConvert4to3(array4, array3);
260  for (j = 0; j < i - 1; j++)
261  bytea[out_index++] = array3[j];
262  }
263  *poutsize = out_index;
264 
265  LEPT_FREE(rtable64);
266  return bytea;
267 }
268 
269 
273 static l_int32
274 isBase64(char c)
275 {
276  return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
277 }
278 
282 static l_int32 *
283 genReverseTab64()
284 {
285 l_int32 i;
286 l_int32 *rtable64;
287 
288  rtable64 = (l_int32 *)LEPT_CALLOC(128, sizeof(l_int32));
289  for (i = 0; i < 64; i++) {
290  rtable64[(unsigned char)tablechar64[i]] = i;
291  }
292  return rtable64;
293 }
294 
298 static void
299 byteConvert3to4(l_uint8 *in3,
300  l_uint8 *out4)
301 {
302  out4[0] = in3[0] >> 2;
303  out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
304  out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
305  out4[3] = in3[2] & 0x3f;
306  return;
307 }
308 
312 static void
313 byteConvert4to3(l_uint8 *in4,
314  l_uint8 *out3)
315 {
316  out3[0] = (in4[0] << 2) | (in4[1] >> 4);
317  out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
318  out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
319  return;
320 }
321 
322 
323 /*-------------------------------------------------------------*
324  * Utility for encoding and decoding data with ascii85 *
325  *-------------------------------------------------------------*/
341 char *
342 encodeAscii85(const l_uint8 *inarray,
343  size_t insize,
344  size_t *poutsize)
345 {
346 char *chara;
347 char outbuf[8];
348 l_int32 maxsize, i, index, linecount, nbout, eof;
349 size_t outindex;
350 
351  if (!poutsize)
352  return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
353  *poutsize = 0;
354  if (!inarray)
355  return (char *)ERROR_PTR("inarray not defined", __func__, NULL);
356  if (insize <= 0)
357  return (char *)ERROR_PTR("insize not > 0", __func__, NULL);
358 
359  /* Accumulate results in char array */
360  maxsize = (l_int32)(80. + (insize * 5. / 4.) *
361  (1. + 2. / MAX_ASCII85_LINE));
362  if ((chara = (char *)LEPT_CALLOC(maxsize, sizeof(char))) == NULL)
363  return (char *)ERROR_PTR("chara not made", __func__, NULL);
364 
365  linecount = 0;
366  index = 0;
367  outindex = 0;
368  while (1) {
369  eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
370  for (i = 0; i < nbout; i++) {
371  chara[outindex++] = outbuf[i];
372  linecount++;
373  if (linecount >= MAX_ASCII85_LINE) {
374  chara[outindex++] = '\n';
375  linecount = 0;
376  }
377  }
378  if (eof == TRUE) {
379  if (linecount != 0)
380  chara[outindex++] = '\n';
381  chara[outindex++] = '~';
382  chara[outindex++] = '>';
383  chara[outindex++] = '\n';
384  break;
385  }
386  }
387 
388  *poutsize = outindex;
389  return chara;
390 }
391 
392 
409 static l_int32
410 convertChunkToAscii85(const l_uint8 *inarray,
411  size_t insize,
412  l_int32 *pindex,
413  char *outbuf,
414  l_int32 *pnbout)
415 {
416 l_uint8 inbyte;
417 l_uint32 inword, val;
418 l_int32 eof, index, nread, nbout, i;
419 
420  eof = FALSE;
421  index = *pindex;
422  nread = L_MIN(4, (insize - index));
423  if (insize == index + nread)
424  eof = TRUE;
425  *pindex += nread; /* save new index */
426 
427  /* Read input data and save in l_uint32 */
428  inword = 0;
429  for (i = 0; i < nread; i++) {
430  inbyte = inarray[index + i];
431  inword += (l_uint32)inbyte << (8 * (3 - i));
432  }
433 
434 #if 0
435  lept_stderr("index = %d, nread = %d\n", index, nread);
436  lept_stderr("inword = %x\n", inword);
437  lept_stderr("eof = %d\n", eof);
438 #endif
439 
440  /* Special case: output 1 byte only */
441  if (inword == 0) {
442  outbuf[0] = 'z';
443  nbout = 1;
444  } else { /* output nread + 1 bytes */
445  for (i = 4; i >= 4 - nread; i--) {
446  val = inword / power85[i];
447  outbuf[4 - i] = (l_uint8)(val + '!');
448  inword -= val * power85[i];
449  }
450  nbout = nread + 1;
451  }
452  *pnbout = nbout;
453 
454  return eof;
455 }
456 
457 
474 l_uint8 *
475 decodeAscii85(const char *inarray,
476  size_t insize,
477  size_t *poutsize)
478 {
479 char inc;
480 const char *pin;
481 l_uint8 val;
482 l_uint8 *outa;
483 l_int32 maxsize, ocount, bytecount, index;
484 l_uint32 oword;
485 
486  if (!poutsize)
487  return (l_uint8 *)ERROR_PTR("&outsize not defined", __func__, NULL);
488  *poutsize = 0;
489  if (!inarray)
490  return (l_uint8 *)ERROR_PTR("inarray not defined", __func__, NULL);
491  if (insize <= 0)
492  return (l_uint8 *)ERROR_PTR("insize not > 0", __func__, NULL);
493 
494  /* Accumulate results in outa */
495  maxsize = (l_int32)(80. + (insize * 4. / 5.)); /* plenty big */
496  if ((outa = (l_uint8 *)LEPT_CALLOC(maxsize, sizeof(l_uint8))) == NULL)
497  return (l_uint8 *)ERROR_PTR("outa not made", __func__, NULL);
498 
499  pin = inarray;
500  ocount = 0; /* byte index into outa */
501  oword = 0;
502  for (index = 0, bytecount = 0; index < insize; index++, pin++) {
503  inc = *pin;
504 
505  if (inc == ' ' || inc == '\t' || inc == '\n' ||
506  inc == '\f' || inc == '\r' || inc == '\v') /* ignore white space */
507  continue;
508 
509  val = inc - '!';
510  if (val < 85) {
511  oword = oword * 85 + val;
512  if (bytecount < 4) {
513  bytecount++;
514  } else { /* we have all 5 input chars for the oword */
515  outa[ocount] = (oword >> 24) & 0xff;
516  outa[ocount + 1] = (oword >> 16) & 0xff;
517  outa[ocount + 2] = (oword >> 8) & 0xff;
518  outa[ocount + 3] = oword & 0xff;
519  ocount += 4;
520  bytecount = 0;
521  oword = 0;
522  }
523  } else if (inc == 'z' && bytecount == 0) {
524  outa[ocount] = 0;
525  outa[ocount + 1] = 0;
526  outa[ocount + 2] = 0;
527  outa[ocount + 3] = 0;
528  ocount += 4;
529  } else if (inc == '~') { /* end of data */
530  L_INFO(" %d extra bytes output\n", __func__, bytecount - 1);
531  switch (bytecount) {
532  case 0: /* normal eof */
533  case 1: /* error */
534  break;
535  case 2: /* 1 extra byte */
536  oword = oword * power85[3] + 0xffffff;
537  outa[ocount] = (oword >> 24) & 0xff;
538  break;
539  case 3: /* 2 extra bytes */
540  oword = oword * power85[2] + 0xffff;
541  outa[ocount] = (oword >> 24) & 0xff;
542  outa[ocount + 1] = (oword >> 16) & 0xff;
543  break;
544  case 4: /* 3 extra bytes */
545  oword = oword * 85 + 0xff;
546  outa[ocount] = (oword >> 24) & 0xff;
547  outa[ocount + 1] = (oword >> 16) & 0xff;
548  outa[ocount + 2] = (oword >> 8) & 0xff;
549  break;
550  }
551  if (bytecount > 1)
552  ocount += (bytecount - 1);
553  break;
554  }
555  }
556  *poutsize = ocount;
557 
558  return outa;
559 }
560 
561 
577 char *
578 encodeAscii85WithComp(const l_uint8 *indata,
579  size_t insize,
580  size_t *poutsize)
581 {
582 char *outstr;
583 size_t size1;
584 l_uint8 *data1;
585 
586  if (!poutsize)
587  return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
588  *poutsize = 0;
589  if (!indata)
590  return (char *)ERROR_PTR("indata not defined", __func__, NULL);
591 
592  if ((data1 = zlibCompress(indata, insize, &size1)) == NULL)
593  return (char *)ERROR_PTR("data1 not made", __func__, NULL);
594  outstr = encodeAscii85(data1, size1, poutsize);
595  LEPT_FREE(data1);
596  return outstr;
597 }
598 
599 
616 l_uint8 *
617 decodeAscii85WithComp(const char *instr,
618  size_t insize,
619  size_t *poutsize)
620 {
621 size_t size1;
622 l_uint8 *data1, *outdata;
623 
624  if (!poutsize)
625  return (l_uint8 *)ERROR_PTR("&outsize not defined", __func__, NULL);
626  *poutsize = 0;
627  if (!instr)
628  return (l_uint8 *)ERROR_PTR("instr not defined", __func__, NULL);
629 
630  if (insize == 0) insize = strlen(instr);
631  if ((data1 = decodeAscii85(instr, insize, &size1)) == NULL)
632  return (l_uint8 *)ERROR_PTR("data1 not made", __func__, NULL);
633  outdata = zlibUncompress(data1, size1, poutsize);
634  LEPT_FREE(data1);
635  return outdata;
636 }
637 
638 
639 /*-------------------------------------------------------------*
640  * String reformatting for base 64 encoded data *
641  *-------------------------------------------------------------*/
663 char *
664 reformatPacked64(const char *inarray,
665  l_int32 insize,
666  l_int32 leadspace,
667  l_int32 linechars,
668  l_int32 addquotes,
669  l_int32 *poutsize)
670 {
671 char *flata, *outa;
672 l_int32 i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;
673 
674  if (!poutsize)
675  return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
676  *poutsize = 0;
677  if (!inarray)
678  return (char *)ERROR_PTR("inarray not defined", __func__, NULL);
679  if (insize <= 0)
680  return (char *)ERROR_PTR("insize not > 0", __func__, NULL);
681  if (leadspace < 0)
682  return (char *)ERROR_PTR("leadspace must be >= 0", __func__, NULL);
683  if (linechars % 4)
684  return (char *)ERROR_PTR("linechars % 4 must be 0", __func__, NULL);
685 
686  /* Remove all white space */
687  if ((flata = (char *)LEPT_CALLOC(insize, sizeof(char))) == NULL)
688  return (char *)ERROR_PTR("flata not made", __func__, NULL);
689  for (i = 0, flatindex = 0; i < insize; i++) {
690  if (isBase64(inarray[i]) || inarray[i] == '=')
691  flata[flatindex++] = inarray[i];
692  }
693 
694  /* Generate output string */
695  flatsize = flatindex;
696  nlines = (flatsize + linechars - 1) / linechars;
697  linewithpad = leadspace + linechars + 1; /* including newline */
698  if (addquotes) linewithpad += 2;
699  if ((outa = (char *)LEPT_CALLOC((size_t)nlines * linewithpad,
700  sizeof(char))) == NULL) {
701  LEPT_FREE(flata);
702  return (char *)ERROR_PTR("outa not made", __func__, NULL);
703  }
704  for (j = 0, outindex = 0; j < leadspace; j++)
705  outa[outindex++] = ' ';
706  if (addquotes) outa[outindex++] = '"';
707  for (i = 0, linecount = 0; i < flatsize; i++) {
708  if (linecount == linechars) {
709  if (addquotes) outa[outindex++] = '"';
710  outa[outindex++] = '\n';
711  for (j = 0; j < leadspace; j++)
712  outa[outindex++] = ' ';
713  if (addquotes) outa[outindex++] = '"';
714  linecount = 0;
715  }
716  outa[outindex++] = flata[i];
717  linecount++;
718  }
719  if (addquotes) outa[outindex++] = '"';
720  *poutsize = outindex;
721 
722  LEPT_FREE(flata);
723  return outa;
724 }
void lept_stderr(const char *fmt,...)
lept_stderr()
Definition: utils1.c:306
l_uint8 * zlibUncompress(const l_uint8 *datain, size_t nin, size_t *pnout)
zlibUncompress()
Definition: zlibmem.c:193
l_uint8 * zlibCompress(const l_uint8 *datain, size_t nin, size_t *pnout)
zlibCompress()
Definition: zlibmem.c:92