From e17e5f40b81447a2af65b0e64a3295d5e2e86753 Mon Sep 17 00:00:00 2001 From: Pádraig Brady Date: Thu, 22 Dec 2016 14:31:44 +0000 Subject: wc: with only --bytes, determine size more efficiently * src/wc.c (wc): Avoid reading the end of the file when the size is not a multiple of PAGE_SIZE, as the special case handling for files in /proc and /sys is only required when st_size is 0 or a multiple of PAGE_SIZE. * tests/misc/wc-proc.sh: Add a test case. --- src/wc.c | 48 +++++++++++++++++++++++++++++++++++++----------- tests/misc/wc-proc.sh | 13 +++++++++++++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/wc.c b/src/wc.c index 64df50cd9..a02379bf8 100644 --- a/src/wc.c +++ b/src/wc.c @@ -71,6 +71,9 @@ static int number_width; /* True if we have ever read the standard input. */ static bool have_read_stdin; +/* Used to determine if file size can be determined without reading. */ +static size_t page_size; + /* The result of calling fstat or stat on a file descriptor or file. */ struct fstatus { @@ -235,6 +238,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) if (count_bytes && !count_chars && !print_lines && !count_complicated) { + bool skip_read = false; + if (0 < fstatus->failed) fstatus->failed = fstat (fd, &fstatus->st); @@ -245,24 +250,44 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) && 0 <= fstatus->st.st_size) { size_t end_pos = fstatus->st.st_size; - off_t hi_pos = end_pos - end_pos % (ST_BLKSIZE (fstatus->st) + 1); if (current_pos < 0) current_pos = lseek (fd, 0, SEEK_CUR); - if (0 <= current_pos && current_pos < hi_pos - && 0 <= lseek (fd, hi_pos, SEEK_CUR)) - bytes = hi_pos - current_pos; + + if (end_pos % page_size) + { + /* We only need special handling of /proc and /sys files etc. + when they're a multiple of PAGE_SIZE. In the common case + for files with st_size not a multiple of PAGE_SIZE, + it's more efficient and accurate to use st_size. + + Be careful here. The current position may actually be + beyond the end of the file. As in the example above. */ + + bytes = end_pos < current_pos ? 0 : end_pos - current_pos; + skip_read = true; + } + else + { + off_t hi_pos = end_pos - end_pos % (ST_BLKSIZE (fstatus->st) + 1); + if (0 <= current_pos && current_pos < hi_pos + && 0 <= lseek (fd, hi_pos, SEEK_CUR)) + bytes = hi_pos - current_pos; + } } - fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL); - while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) + if (! skip_read) { - if (bytes_read == SAFE_READ_ERROR) + fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL); + while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) { - error (0, errno, "%s", quotef (file)); - ok = false; - break; + if (bytes_read == SAFE_READ_ERROR) + { + error (0, errno, "%s", quotef (file)); + ok = false; + break; + } + bytes += bytes_read; } - bytes += bytes_read; } } else if (!count_chars && !count_complicated) @@ -639,6 +664,7 @@ main (int argc, char **argv) atexit (close_stdout); + page_size = getpagesize (); /* Line buffer stdout to ensure lines are written atomically and immediately so that processes running in parallel do not intersperse their output. */ setvbuf (stdout, NULL, _IOLBF, 0); diff --git a/tests/misc/wc-proc.sh b/tests/misc/wc-proc.sh index d6a36ba18..c50d7832b 100755 --- a/tests/misc/wc-proc.sh +++ b/tests/misc/wc-proc.sh @@ -19,6 +19,7 @@ . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src print_ver_ wc +# Ensure we read() /proc files to determine content length for file in /proc/version /sys/kernel/profiling; do if test -r $file; then cp -f $file copy && @@ -29,4 +30,16 @@ for file in /proc/version /sys/kernel/profiling; do fi done +# Ensure we handle cases where we don't read() +truncate -s 2 no_read || framework_failure_ +# read() used when multiple of page size +truncate -s 1048576 do_read || framework_failure_ +wc -c no_read do_read > out || fail=1 +cat <<\EOF > exp + 2 no_read +1048576 do_read +1048578 total +EOF +compare exp out || fail=1 + Exit $fail -- cgit v1.2.3-54-g00ecf