From 9459d9d8112fe7816022665b5016c2014bb625f3 Mon Sep 17 00:00:00 2001 From: Pádraig Brady Date: Thu, 22 Oct 2015 14:34:08 +0100 Subject: copy,dd: simplify and optimize NUL bytes detection * src/factor.c: Move LIKELY() definition to... * src/system.h: ...here. (is_nul): Reimplement with a version that doesn't require a sentinel after the buffer, and which calls down to (the system optimized) memcmp. Performance analyzed at http://rusty.ozlabs.org/?p=560 * src/dd.c (alloc_obuf): Simplify the is_nul() call by not needing to write the sentinel. * src/copy.c (sparse_copy): Likewise. (copy_reg): Simplify the buffer allocation by avoiding consideration of the sentinel in the buffer size calculation. --- src/system.h | 72 +++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 18 deletions(-) (limited to 'src/system.h') diff --git a/src/system.h b/src/system.h index 8f6a2ea84..1cd6bdb44 100644 --- a/src/system.h +++ b/src/system.h @@ -427,6 +427,15 @@ enum # define ATTRIBUTE_WARN_UNUSED_RESULT __attribute__ ((__warn_unused_result__)) #endif +#ifdef __GNUC__ +# define LIKELY(cond) __builtin_expect ((cond), 1) +# define UNLIKELY(cond) __builtin_expect ((cond), 0) +#else +# define LIKELY(cond) (cond) +# define UNLIKELY(cond) (cond) +#endif + + #if defined strdupa # define ASSIGN_STRDUPA(DEST, S) \ do { DEST = strdupa (S); } while (0) @@ -487,27 +496,54 @@ ptr_align (void const *ptr, size_t alignment) } /* Return whether the buffer consists entirely of NULs. - Note the word after the buffer must be non NUL. */ + Based on memeqzero in CCAN by Rusty Russell under CC0 (Public domain). */ static inline bool _GL_ATTRIBUTE_PURE -is_nul (void const *buf, size_t bufsize) +is_nul (void const *buf, size_t length) { - typedef uintptr_t word; - void const *vp; - char const *cbuf = buf; - word const *wp = buf; - - /* Find first nonzero *word*, or the word with the sentinel. 
*/ - while (*wp++ == 0) - continue; - - /* Find the first nonzero *byte*, or the sentinel. */ - vp = wp - 1; - char const *cp = vp; - while (*cp++ == 0) - continue; - - return cbuf + bufsize < cp; + const unsigned char *p = buf; +/* Using possibly unaligned access for the first 16 bytes + saves about 30-40 cycles, though it is strictly undefined behavior + and so would need __attribute__ ((__no_sanitize_undefined__)) + to avoid -fsanitize=undefined warnings. + Considering coreutils is mainly concerned with relatively + large buffers, we'll just use the defined behavior. */ +#if 0 && _STRING_ARCH_unaligned + unsigned long word; +#else + unsigned char word; +#endif + + if (! length) + return true; + + /* Check bytes singly until LENGTH is a multiple of sizeof word (never runs while WORD is an unsigned char). */ + while (UNLIKELY (length & (sizeof word - 1))) + { + if (*p) + return false; + p++; + length--; + if (! length) + return true; + } + + /* Check a word at a time, always at least one, until LENGTH is a multiple of 16. */ + for (;;) + { + memcpy (&word, p, sizeof word); + if (word) + return false; + p += sizeof word; + length -= sizeof word; + if (! length) + return true; + if (UNLIKELY (length & 15) == 0) + break; + } + + /* The P - BUF leading bytes checked above are NUL (not necessarily 16), so memcmp of the buffer with itself at that offset checks the rest. */ + return memcmp (buf, p, length) == 0; } /* If 10*Accum + Digit_val is larger than the maximum value for Type, -- cgit v1.2.3-54-g00ecf