summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog15
-rw-r--r--doc/ChangeLog5
-rw-r--r--doc/coreutils.texi3
-rw-r--r--gl/lib/base64.c256
-rw-r--r--gl/lib/base64.h13
-rw-r--r--src/base64.c34
-rwxr-xr-xtests/misc/base64104
7 files changed, 340 insertions, 90 deletions
diff --git a/ChangeLog b/ChangeLog
index 7de2a50c9..50d56dae5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
2007-01-04 Jim Meyering <jim@meyering.net>
+ When decoding, always allow newlines in input, with almost no
+ performance impact.
+ * src/base64.c (do_decode): Initialize decode context.
+ Call base64_decode one more time, after all input is processed.
+ (usage): When decoding, newlines are always accepted.
+
+ * tests/misc/base64: Add a bunch of tests, for the above.
+ * gl/lib/base64.c: Include <string.h>.
+ (base64_decode_ctx_init, get_4, decode_4): New functions.
+ (base64_decode): Efficiently handle interspersed newlines.
+ (base64_decode_alloc): Update signature.
+ * gl/lib/base64.h (struct base64_decode_context): Define.
+ (base64_decode_ctx_init): Add prototype.
+ (base64_decode, base64_decode_alloc): Update prototypes.
+
* gl/lib/base64.c: Copied from gnulib.
* gl/lib/base64.h: Likewise.
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 753aac22e..5f590af3f 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,8 @@
+2007-01-04 Jim Meyering <jim@meyering.net>
+
+ * coreutils.texi (base64 invocation): When decoding, newlines
+ are always accepted.
+
2007-01-03 Jim Meyering <jim@meyering.net>
Document what the ".0" in e.g., "-k 2,3.0" means, and...
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index ae072d74d..6fc670479 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -1849,7 +1849,8 @@ output will be the original data.
@opindex -i
@opindex --ignore-garbage
@cindex Ignore garbage in base64 stream
-During decoding, ignore unrecognized characters (including newline),
+When decoding, newlines are always accepted.
+During decoding, ignore unrecognized bytes,
to permit distorted data to be decoded.
@end table
diff --git a/gl/lib/base64.c b/gl/lib/base64.c
index f237cd63b..ba7322a26 100644
--- a/gl/lib/base64.c
+++ b/gl/lib/base64.c
@@ -1,5 +1,5 @@
/* base64.c -- Encode binary data using printable characters.
- Copyright (C) 1999, 2000, 2001, 2004, 2005, 2006 Free Software
+ Copyright (C) 1999, 2000, 2001, 2004, 2005, 2006, 2007 Free Software
Foundation, Inc.
This program is free software; you can redistribute it and/or modify
@@ -52,6 +52,8 @@
/* Get UCHAR_MAX. */
#include <limits.h>
+#include <string.h>
+
/* C89 compliant way to cast 'char' to 'unsigned char'. */
static inline unsigned char
to_uchar (char ch)
@@ -300,89 +302,220 @@ isbase64 (char ch)
return uchar_in_range (to_uchar (ch)) && 0 <= b64[to_uchar (ch)];
}
-/* Decode base64 encoded input array IN of length INLEN to output
- array OUT that can hold *OUTLEN bytes. Return true if decoding was
- successful, i.e. if the input was valid base64 data, false
- otherwise. If *OUTLEN is too small, as many bytes as possible will
- be written to OUT. On return, *OUTLEN holds the length of decoded
- bytes in OUT. Note that as soon as any non-alphabet characters are
- encountered, decoding is stopped and false is returned. This means
- that, when applicable, you must remove any line terminators that is
- part of the data stream before calling this function. */
-bool
-base64_decode (const char *restrict in, size_t inlen,
- char *restrict out, size_t *outlen)
+/* Initialize decode-context buffer, CTX. */
+void
+base64_decode_ctx_init (struct base64_decode_context *ctx)
{
- size_t outleft = *outlen;
+ ctx->i = 0;
+}
- while (inlen >= 2)
- {
- if (!isbase64 (in[0]) || !isbase64 (in[1]))
- break;
+/* If CTX->i is 0 or 4, there are four or more bytes in [*IN..IN_END), and
+ none of those four is a newline, then return *IN. Otherwise, copy up to
+ 4 - CTX->i non-newline bytes from that range into CTX->buf, starting at
+ index CTX->i and setting CTX->i to reflect the number of bytes copied,
+ and return CTX->buf. In either case, advance *IN to point to the byte
+ after the last one processed, and set *N_NON_NEWLINE to the number of
+ verified non-newline bytes accessible through the returned pointer. */
+static inline char *
+get_4 (struct base64_decode_context *ctx,
+ char const *restrict *in, char const *restrict in_end,
+ size_t *n_non_newline)
+{
+ if (ctx->i == 4)
+ ctx->i = 0;
- if (outleft)
+ if (ctx->i == 0)
+ {
+ char const *t = *in;
+ if (4 <= in_end - *in && memchr (t, '\n', 4) == NULL)
{
- *out++ = ((b64[to_uchar (in[0])] << 2)
- | (b64[to_uchar (in[1])] >> 4));
- outleft--;
+ /* This is the common case: no newline. */
+ *in += 4;
+ *n_non_newline = 4;
+ return (char *) t;
}
+ }
- if (inlen == 2)
- break;
+ {
+ /* Copy non-newline bytes into BUF. */
+ char const *p = *in;
+ while (p < in_end)
+ {
+ char c = *p++;
+ if (c != '\n')
+ {
+ ctx->buf[ctx->i++] = c;
+ if (ctx->i == 4)
+ break;
+ }
+ }
+
+ *in = p;
+ *n_non_newline = ctx->i;
+ return ctx->buf;
+ }
+}
+
+#define return_false \
+ do \
+ { \
+ *outp = out; \
+ return false; \
+ } \
+ while (false)
+
+/* Decode up to four bytes of base64-encoded data, IN, of length INLEN
+ into the output buffer, *OUT, of size *OUTLEN bytes. Return true if
+ decoding is successful, false otherwise. If *OUTLEN is too small,
+ as many bytes as possible are written to *OUT. On return, advance
+ *OUT to point to the byte after the last one written, and decrement
+ *OUTLEN to reflect the number of bytes remaining in *OUT. */
+static inline bool
+decode_4 (char const *restrict in, size_t inlen,
+ char *restrict *outp, size_t *outleft)
+{
+ char *out = *outp;
+ if (inlen < 2)
+ return false;
- if (in[2] == '=')
+ if (!isbase64 (in[0]) || !isbase64 (in[1]))
+ return false;
+
+ if (*outleft)
+ {
+ *out++ = ((b64[to_uchar (in[0])] << 2)
+ | (b64[to_uchar (in[1])] >> 4));
+ --*outleft;
+ }
+
+ if (inlen == 2)
+ return_false;
+
+ if (in[2] == '=')
+ {
+ if (inlen != 4)
+ return_false;
+
+ if (in[3] != '=')
+ return_false;
+ }
+ else
+ {
+ if (!isbase64 (in[2]))
+ return_false;
+
+ if (*outleft)
{
- if (inlen != 4)
- break;
+ *out++ = (((b64[to_uchar (in[1])] << 4) & 0xf0)
+ | (b64[to_uchar (in[2])] >> 2));
+ --*outleft;
+ }
- if (in[3] != '=')
- break;
+ if (inlen == 3)
+ return_false;
+ if (in[3] == '=')
+ {
+ if (inlen != 4)
+ return_false;
}
else
{
- if (!isbase64 (in[2]))
- break;
+ if (!isbase64 (in[3]))
+ return_false;
- if (outleft)
+ if (*outleft)
{
- *out++ = (((b64[to_uchar (in[1])] << 4) & 0xf0)
- | (b64[to_uchar (in[2])] >> 2));
- outleft--;
+ *out++ = (((b64[to_uchar (in[2])] << 6) & 0xc0)
+ | b64[to_uchar (in[3])]);
+ --*outleft;
}
+ }
+ }
- if (inlen == 3)
- break;
+ *outp = out;
+ return true;
+}
- if (in[3] == '=')
- {
- if (inlen != 4)
- break;
- }
- else
+/* Decode base64-encoded input array IN of length INLEN to output array
+ OUT that can hold *OUTLEN bytes. The input data may be interspersed
+ with newlines. Return true if decoding was successful, i.e. if the
+ input was valid base64 data, false otherwise. If *OUTLEN is too
+ small, as many bytes as possible will be written to OUT. On return,
+ *OUTLEN holds the length of decoded bytes in OUT. Note that as soon
+ as any non-alphabet, non-newline character is encountered, decoding
+ is stopped and false is returned. If INLEN is zero, then process
+ only whatever data is stored in CTX.
+
+ Initially, CTX must have been initialized via base64_decode_ctx_init.
+ Subsequent calls to this function must reuse whatever state is recorded
+ in that buffer. It is necessary for when a quadruple of base64 input
+ bytes spans two input buffers. */
+
+bool
+base64_decode (struct base64_decode_context *ctx,
+ const char *restrict in, size_t inlen,
+ char *restrict out, size_t *outlen)
+{
+ size_t outleft = *outlen;
+ bool flush_ctx = inlen == 0;
+
+ while (true)
+ {
+ size_t outleft_save = outleft;
+ if (ctx->i == 0 && !flush_ctx)
+ {
+ while (true)
{
- if (!isbase64 (in[3]))
+ /* Save a copy of outleft, in case we need to re-parse this
+ block of four bytes. */
+ outleft_save = outleft;
+ if (!decode_4 (in, inlen, &out, &outleft))
break;
- if (outleft)
- {
- *out++ = (((b64[to_uchar (in[2])] << 6) & 0xc0)
- | b64[to_uchar (in[3])]);
- outleft--;
- }
+ in += 4;
+ inlen -= 4;
}
}
- in += 4;
- inlen -= 4;
+ if (inlen == 0 && !flush_ctx)
+ break;
+
+ /* Handle the common case of 72-byte wrapped lines.
+ This also handles any other multiple-of-4-byte wrapping. */
+ if (inlen && *in == '\n')
+ {
+ ++in;
+ --inlen;
+ continue;
+ }
+
+ /* Restore OUT and OUTLEFT. */
+ out -= outleft_save - outleft;
+ outleft = outleft_save;
+
+ {
+ char const *in_end = in + inlen;
+ char const *non_nl = get_4 (ctx, &in, in_end, &inlen);
+
+ /* If the input is empty or consists solely of newlines (0 non-newlines),
+ then we're done. Likewise if there are fewer than 4 bytes when not
+ flushing context. */
+ if (inlen == 0 || (inlen < 4 && !flush_ctx))
+ {
+ inlen = 0;
+ break;
+ }
+ if (!decode_4 (non_nl, 4, &out, &outleft))
+ break;
+
+ inlen = in_end - in;
+ }
}
*outlen -= outleft;
- if (inlen != 0)
- return false;
-
- return true;
+ return inlen == 0;
}
/* Allocate an output buffer in *OUT, and decode the base64 encoded
@@ -397,12 +530,13 @@ base64_decode (const char *restrict in, size_t inlen,
input was invalid, in which case *OUT is NULL and *OUTLEN is
undefined. */
bool
-base64_decode_alloc (const char *in, size_t inlen, char **out,
+base64_decode_alloc (struct base64_decode_context *ctx,
+ const char *in, size_t inlen, char **out,
size_t *outlen)
{
- /* This may allocate a few bytes too much, depending on input,
- but it's not worth the extra CPU time to compute the exact amount.
- The exact amount is 3 * inlen / 4, minus 1 if the input ends
+ /* This may allocate a few bytes too many, depending on input,
+ but it's not worth the extra CPU time to compute the exact size.
+ The exact size is 3 * inlen / 4, minus 1 if the input ends
with "=" and minus another 1 if the input ends with "==".
Dividing before multiplying avoids the possibility of overflow. */
size_t needlen = 3 * (inlen / 4) + 2;
@@ -411,7 +545,7 @@ base64_decode_alloc (const char *in, size_t inlen, char **out,
if (!*out)
return true;
- if (!base64_decode (in, inlen, *out, &needlen))
+ if (!base64_decode (ctx, in, inlen, *out, &needlen))
{
free (*out);
*out = NULL;
diff --git a/gl/lib/base64.h b/gl/lib/base64.h
index 6bb9a97f5..be858eaaf 100644
--- a/gl/lib/base64.h
+++ b/gl/lib/base64.h
@@ -29,6 +29,12 @@
integer >= n/k, i.e., the ceiling of n/k. */
# define BASE64_LENGTH(inlen) ((((inlen) + 2) / 3) * 4)
+struct base64_decode_context
+{
+ unsigned int i;
+ char buf[4];
+};
+
extern bool isbase64 (char ch);
extern void base64_encode (const char *restrict in, size_t inlen,
@@ -36,10 +42,13 @@ extern void base64_encode (const char *restrict in, size_t inlen,
extern size_t base64_encode_alloc (const char *in, size_t inlen, char **out);
-extern bool base64_decode (const char *restrict in, size_t inlen,
+extern void base64_decode_ctx_init (struct base64_decode_context *ctx);
+extern bool base64_decode (struct base64_decode_context *ctx,
+ const char *restrict in, size_t inlen,
char *restrict out, size_t *outlen);
-extern bool base64_decode_alloc (const char *in, size_t inlen,
+extern bool base64_decode_alloc (struct base64_decode_context *ctx,
+ const char *in, size_t inlen,
char **out, size_t *outlen);
#endif /* BASE64_H */
diff --git a/src/base64.c b/src/base64.c
index 36afbe4df..9f8ff417c 100644
--- a/src/base64.c
+++ b/src/base64.c
@@ -1,5 +1,5 @@
/* Base64 encode/decode strings or files.
- Copyright (C) 2004, 2005, 2006 Free Software Foundation, Inc.
+ Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of Base64.
@@ -83,9 +83,10 @@ With no FILE, or when FILE is -, read standard input.\n"), stdout);
fputs (_("\
\n\
The data are encoded as described for the base64 alphabet in RFC 3548.\n\
-Decoding require compliant input by default, use --ignore-garbage to\n\
-attempt to recover from non-alphabet characters (such as newlines) in\n\
-the encoded stream.\n"), stdout);
+When decoding, the input may contain newlines in addition to the bytes of\n\
+the formal base64 alphabet. Use --ignore-garbage to attempt to recover\n\
+from any other non-alphabet bytes in the encoded stream.\n"),
+ stdout);
printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
}
@@ -183,11 +184,15 @@ do_decode (FILE *in, FILE *out, bool ignore_garbage)
char inbuf[B64BLOCKSIZE];
char outbuf[BLOCKSIZE];
size_t sum;
+ struct base64_decode_context ctx;
+
+ base64_decode_ctx_init (&ctx);
do
{
bool ok;
size_t n;
+ unsigned int k;
sum = 0;
do
@@ -211,14 +216,23 @@ do_decode (FILE *in, FILE *out, bool ignore_garbage)
}
while (sum < B64BLOCKSIZE && !feof (in));
- n = BLOCKSIZE;
- ok = base64_decode (inbuf, sum, outbuf, &n);
+ /* The following "loop" is usually iterated just once.
+ However, when it processes the final input buffer, we want
+ to iterate it one additional time, but with an indicator
+ telling it to flush what is in CTX. */
+ for (k = 0; k < 1 + feof (in); k++)
+ {
+ if (k == 1 && ctx.i == 0)
+ break;
+ n = BLOCKSIZE;
+ ok = base64_decode (&ctx, inbuf, (k == 0 ? sum : 0), outbuf, &n);
- if (fwrite (outbuf, 1, n, out) < n)
- error (EXIT_FAILURE, errno, _("write error"));
+ if (fwrite (outbuf, 1, n, out) < n)
+ error (EXIT_FAILURE, errno, _("write error"));
- if (!ok)
- error (EXIT_FAILURE, 0, _("invalid input"));
+ if (!ok)
+ error (EXIT_FAILURE, 0, _("invalid input"));
+ }
}
while (!feof (in));
}
diff --git a/tests/misc/base64 b/tests/misc/base64
index 60332a044..70295fb7f 100755
--- a/tests/misc/base64
+++ b/tests/misc/base64
@@ -2,7 +2,7 @@
# -*- perl -*-
# Exercise base64.
-# Copyright (C) 2006 Free Software Foundation, Inc.
+# Copyright (C) 2006, 2007 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -37,21 +37,69 @@ use strict;
# Turn off localisation of executable's ouput.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
-(my $a39 = 'YWFh' x 13) =~ s/(.{5})/$1\n/g;
+# Return the encoding of a string of N 'a's.
+sub enc($)
+{
+ my ($n) = @_;
+ my %remainder = ( 0 => '', 1 => 'YQ==', 2 => 'YWE=' );
+ return 'YWFh' x ($n / 3) . $remainder{$n % 3};
+}
+
+# Construct an encoded string of length 4KB, using 3K "a"s.
+my $a3k = enc 3072;
+my @a3k_nl;
+# A few copies, each with different number of newlines at the start.
+for my $k (0..3)
+ {
+ (my $t = $a3k) =~ s/^/"\n"x $k/e;
+ push @a3k_nl, $t;
+ }
+
+# Return a copy of S, with newlines inserted every WIDTH bytes.
+# Ensure that the result (if not the empty string) is newline-terminated.
+sub wrap($$)
+{
+ my ($s, $width) = @_;
+ $s =~ s/(.{$width})/$1\n/g;
+ substr ($s, -1, 1) ne "\n"
+ and $s .= "\n";
+ return $s;
+}
my @Tests =
(
['empty', {IN=>''}, {OUT=>""}],
['inout', {IN=>'a'}, {OUT=>"YQ==\n"}],
['wrap', '--wrap 0', {IN=>'foo'}, {OUT=>'Zm9v'}],
- ['wrap5-39', '--wrap=5', {IN=>'a' x 39}, {OUT=>"${a39}\n"}],
- ['wrap5-40', '--wrap=5', {IN=>'a' x 40}, {OUT=>"${a39}YQ=\n=\n"}],
- ['wrap5-41', '--wrap=5', {IN=>'a' x 41}, {OUT=>"${a39}YWE\n=\n"}],
- ['wrap5-42', '--wrap=5', {IN=>'a' x 42}, {OUT=>"${a39}YWF\nh\n"}],
- ['wrap5-43', '--wrap=5', {IN=>'a' x 43}, {OUT=>"${a39}YWF\nhYQ==\n"}],
- ['wrap5-44', '--wrap=5', {IN=>'a' x 44}, {OUT=>"${a39}YWF\nhYWE=\n"}],
- ['wrap5-45', '--wrap=5', {IN=>'a' x 45}, {OUT=>"${a39}YWF\nhYWFh\n"}],
- ['wrap5-46', '--wrap=5', {IN=>'a' x 46}, {OUT=>"${a39}YWF\nhYWFh\nYQ==\n"}],
+ ['wrap5-39', '--wrap=5', {IN=>'a' x 39}, {OUT=>wrap enc(39),5}],
+ ['wrap5-40', '--wrap=5', {IN=>'a' x 40}, {OUT=>wrap enc(40),5}],
+ ['wrap5-41', '--wrap=5', {IN=>'a' x 41}, {OUT=>wrap enc(41),5}],
+ ['wrap5-42', '--wrap=5', {IN=>'a' x 42}, {OUT=>wrap enc(42),5}],
+ ['wrap5-43', '--wrap=5', {IN=>'a' x 43}, {OUT=>wrap enc(43),5}],
+ ['wrap5-44', '--wrap=5', {IN=>'a' x 44}, {OUT=>wrap enc(44),5}],
+ ['wrap5-45', '--wrap=5', {IN=>'a' x 45}, {OUT=>wrap enc(45),5}],
+ ['wrap5-46', '--wrap=5', {IN=>'a' x 46}, {OUT=>wrap enc(46),5}],
+
+ ['buf-1', '--decode', {IN=>enc 1}, {OUT=>'a' x 1}],
+ ['buf-2', '--decode', {IN=>enc 2}, {OUT=>'a' x 2}],
+ ['buf-3', '--decode', {IN=>enc 3}, {OUT=>'a' x 3}],
+ ['buf-4', '--decode', {IN=>enc 4}, {OUT=>'a' x 4}],
+ # 4KB worth of input.
+ ['buf-4k0', '--decode', {IN=>enc 3072+0}, {OUT=>'a' x (3072+0)}],
+ ['buf-4k1', '--decode', {IN=>enc 3072+1}, {OUT=>'a' x (3072+1)}],
+ ['buf-4k2', '--decode', {IN=>enc 3072+2}, {OUT=>'a' x (3072+2)}],
+ ['buf-4k3', '--decode', {IN=>enc 3072+3}, {OUT=>'a' x (3072+3)}],
+ ['buf-4km1','--decode', {IN=>enc 3072-1}, {OUT=>'a' x (3072-1)}],
+ ['buf-4km2','--decode', {IN=>enc 3072-2}, {OUT=>'a' x (3072-2)}],
+ ['buf-4km3','--decode', {IN=>enc 3072-3}, {OUT=>'a' x (3072-3)}],
+ ['buf-4km4','--decode', {IN=>enc 3072-4}, {OUT=>'a' x (3072-4)}],
+
+ # Exercise the case in which the final base-64 byte is
+ # in a buffer all by itself.
+ ['b4k-1', '--decode', {IN=>$a3k_nl[1]}, {OUT=>'a' x (3072+0)}],
+ ['b4k-2', '--decode', {IN=>$a3k_nl[2]}, {OUT=>'a' x (3072+0)}],
+ ['b4k-3', '--decode', {IN=>$a3k_nl[3]}, {OUT=>'a' x (3072+0)}],
+
['baddecode', '--decode', {IN=>'a'}, {OUT=>""},
{ERR_SUBST => 's/.*: invalid input//'}, {ERR => "\n"}, {EXIT => 1}],
['baddecode2', '--decode', {IN=>'ab'}, {OUT=>"i"},
@@ -65,13 +113,18 @@ my @Tests =
);
# For each non-failing test, create a --decode test using the
-# expected output (with newlines removed) as input.
+# expected output as input. Also, add tests inserting newlines.
my @new;
foreach my $t (@Tests)
{
my $exit_val;
my $in;
- my $out;
+ my @out;
+
+ # If the test has a single option of "--decode", then skip it.
+ !ref $t->[1] && $t->[1] eq '--decode'
+ and next;
+
foreach my $e (@$t)
{
ref $e && ref $e eq 'HASH'
@@ -80,11 +133,30 @@ foreach my $t (@Tests)
and $exit_val = $e->{EXIT};
defined $e->{IN}
and $in = $e->{IN};
- defined $e->{OUT}
- and ($out = $e->{OUT}) =~ tr/\n//d;
+ if (defined $e->{OUT})
+ {
+ my $t = $e->{OUT};
+ push @out, $t;
+ my $len = length $t;
+ foreach my $i (0..$len)
+ {
+ my $u = $t;
+ substr ($u, $i, 0) = "\n";
+ push @out, $u;
+ 10 <= $i
+ and last;
+ }
+ }
+ }
+ $exit_val
+ and next;
+
+ my $i = 0;
+ foreach my $o (@out)
+ {
+ push @new, ["d$i-$t->[0]", '--decode', {IN => $o}, {OUT => $in}];
+ ++$i;
}
- defined $out && ! $exit_val
- and push @new, ["d-$t->[0]", '--decode', {IN => $out}, {OUT => $in}];
}
push @Tests, @new;