From 9459d9d8112fe7816022665b5016c2014bb625f3 Mon Sep 17 00:00:00 2001 From: Pádraig Brady Date: Thu, 22 Oct 2015 14:34:08 +0100 Subject: copy,dd: simplify and optimize NUL bytes detection * src/factor.c: Move LIKELY() definition to... * src/system.h: ...here. (is_nul): Reimplement with a version that doesn't require a sentinel after the buffer, and which calls down to (the system optimized) memcmp. Performance analyzed at http://rusty.ozlabs.org/?p=560 * src/dd.c (alloc_obuf): Simplify the is_nul() call by not needing to write the sentinel. * src/copy.c (sparse_copy): Likewise. (copy_reg): Simplify the buffer allocation by avoiding consideration of the sentinel in the buffer size calculation. --- src/copy.c | 22 ++++--------------- src/dd.c | 6 ----- src/factor.c | 8 ------- src/system.h | 72 +++++++++++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 58 insertions(+), 50 deletions(-) diff --git a/src/copy.c b/src/copy.c index 5fe69ea3c..edf022ef1 100644 --- a/src/copy.c +++ b/src/copy.c @@ -245,17 +245,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, csize = MIN (csize, n_read); if (hole_size && csize) - { - /* Setup sentinel required by is_nul(). */ - typedef uintptr_t word; - word isnul_tmp; - memcpy (&isnul_tmp, cbuf + csize, sizeof (word)); - memset (cbuf + csize, 1, sizeof (word)); - - make_hole = is_nul (cbuf, csize); - - memcpy (cbuf + csize, &isnul_tmp, sizeof (word)); - } + make_hole = is_nul (cbuf, csize); bool transition = (make_hole != prev_hole) && psize; bool last_chunk = (n_read == csize && ! make_hole) || ! csize; @@ -1201,11 +1191,8 @@ copy_reg (char const *src_name, char const *dst_name, if (data_copy_required) { - typedef uintptr_t word; - /* Choose a suitable buffer size; it may be adjusted later. */ - size_t buf_alignment = lcm (getpagesize (), sizeof (word)); - size_t buf_alignment_slop = sizeof (word) + buf_alignment - 1; + size_t buf_alignment = getpagesize (); size_t buf_size = io_blksize (sb); size_t hole_size = ST_BLKSIZE (sb); @@ -1236,7 +1223,7 @@ copy_reg (char const *src_name, char const *dst_name, { /* Compute the least common multiple of the input and output buffer sizes, adjusting for outlandish values. */ - size_t blcm_max = MIN (SIZE_MAX, SSIZE_MAX) - buf_alignment_slop; + size_t blcm_max = MIN (SIZE_MAX, SSIZE_MAX) - buf_alignment; size_t blcm = buffer_lcm (io_blksize (src_open_sb), buf_size, blcm_max); @@ -1254,8 +1241,7 @@ copy_reg (char const *src_name, char const *dst_name, buf_size = blcm; } - /* Make a buffer with space for a sentinel at the end. */ - buf_alloc = xmalloc (buf_size + buf_alignment_slop); + buf_alloc = xmalloc (buf_size + buf_alignment); buf = ptr_align (buf_alloc, buf_alignment); if (sparse_src) diff --git a/src/dd.c b/src/dd.c index 06beabf78..a1485f609 100644 --- a/src/dd.c +++ b/src/dd.c @@ -20,7 +20,6 @@ #define SWAB_ALIGN_OFFSET 2 -#include #include #include #include @@ -728,11 +727,6 @@ alloc_obuf (void) alloc_ibuf (); obuf = ibuf; } - - /* Write a sentinel to the slop after the buffer, - to allow efficient checking for NUL blocks. */ - assert (sizeof (uintptr_t) <= OUTPUT_BLOCK_SLOP); - memset (obuf + output_blocksize, 1, sizeof (uintptr_t)); } static void diff --git a/src/factor.c b/src/factor.c index e4ce4a5ea..ee578bfe5 100644 --- a/src/factor.c +++ b/src/factor.c @@ -705,14 +705,6 @@ static bool flag_prove_primality = PROVE_PRIMALITY; /* Number of Miller-Rabin tests to run when not proving primality. */ #define MR_REPS 25 -#ifdef __GNUC__ -# define LIKELY(cond) __builtin_expect ((cond), 1) -# define UNLIKELY(cond) __builtin_expect ((cond), 0) -#else -# define LIKELY(cond) (cond) -# define UNLIKELY(cond) (cond) -#endif - static void factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i, unsigned int off) diff --git a/src/system.h b/src/system.h index 8f6a2ea84..1cd6bdb44 100644 --- a/src/system.h +++ b/src/system.h @@ -427,6 +427,15 @@ enum # define ATTRIBUTE_WARN_UNUSED_RESULT __attribute__ ((__warn_unused_result__)) #endif +#ifdef __GNUC__ +# define LIKELY(cond) __builtin_expect ((cond), 1) +# define UNLIKELY(cond) __builtin_expect ((cond), 0) +#else +# define LIKELY(cond) (cond) +# define UNLIKELY(cond) (cond) +#endif + + #if defined strdupa # define ASSIGN_STRDUPA(DEST, S) \ do { DEST = strdupa (S); } while (0) @@ -487,27 +496,54 @@ ptr_align (void const *ptr, size_t alignment) } /* Return whether the buffer consists entirely of NULs. - Note the word after the buffer must be non NUL. */ + Based on memeqzero in CCAN by Rusty Russell under CC0 (Public domain). */ static inline bool _GL_ATTRIBUTE_PURE -is_nul (void const *buf, size_t bufsize) +is_nul (void const *buf, size_t length) { - typedef uintptr_t word; - void const *vp; - char const *cbuf = buf; - word const *wp = buf; - - /* Find first nonzero *word*, or the word with the sentinel. */ - while (*wp++ == 0) - continue; - - /* Find the first nonzero *byte*, or the sentinel. */ - vp = wp - 1; - char const *cp = vp; - while (*cp++ == 0) - continue; - - return cbuf + bufsize < cp; + const unsigned char *p = buf; +/* Using possibly unaligned access for the first 16 bytes + saves about 30-40 cycles, though it is strictly undefined behavior + and so would need __attribute__ ((__no_sanitize_undefined__)) + to avoid -fsanitize=undefined warnings. + Considering coreutils is mainly concerned with relatively + large buffers, we'll just use the defined behavior. */ +#if 0 && _STRING_ARCH_unaligned + unsigned long word; +#else + unsigned char word; +#endif + + if (! length) + return true; + + /* Check len bytes not aligned on a word. */ + while (UNLIKELY (length & (sizeof word - 1))) + { + if (*p) + return false; + p++; + length--; + if (! length) + return true; + } + + /* Check up to 16 bytes a word at a time. */ + for (;;) + { + memcpy (&word, p, sizeof word); + if (word) + return false; + p += sizeof word; + length -= sizeof word; + if (! length) + return true; + if (UNLIKELY (length & 15) == 0) + break; + } + + /* Now we know first 16 bytes are NUL, memcmp with self. */ + return memcmp (buf, p, length) == 0; } /* If 10*Accum + Digit_val is larger than the maximum value for Type, -- cgit v1.2.3-54-g00ecf