author     Pádraig Brady <P@draigBrady.com>   2013-04-28 23:27:12 +0100
committer  Pádraig Brady <P@draigBrady.com>   2013-04-29 17:54:39 +0100
commit     791919f6d9a873ae7452a7e1d71e2fe9e0fd4104 (patch)
tree       a7a5b0dc85ce57bcd90e27856db758ca04ffec8d
parent     ef9db5735a401f60eb5b4a18a365bf1ece525053 (diff)
download   coreutils-791919f6d9a873ae7452a7e1d71e2fe9e0fd4104.tar.xz
cut: reduce CPU usage for the common case
Ensure appropriate functions are inlined.  This was seen to be required
with gcc 4.6.0 with -O2 on x86_64 at least.  It was reported that
gcc 4.8.0 did inline these functions though.

Also reinstate the bit vector for the common case, to further improve
performance.

Benchmark results for both aspects of this change are:

  $ yes abcdfeg | head -n1MB > big-file
  $ for c in orig inline inline-array; do
      src/cut-$c 2>/dev/null
      echo -ne "\n== $c =="
      time src/cut-$c -b1,3 big-file > /dev/null
    done

  == orig ==
  real    0m0.088s
  user    0m0.081s
  sys     0m0.007s

  == inline ==
  real    0m0.070s
  user    0m0.060s
  sys     0m0.009s

  == inline-array ==
  real    0m0.049s
  user    0m0.044s
  sys     0m0.005s

* src/cut.c (set_fields): Set up the printable_field bit vector for
  performance, but only when it's appropriate, i.e. not when either
  --output-delimiter or huge ranges are specified.
  (next_item): Ensure it's inlined and avoid unnecessary processing.
  (print_kth): Ensure it's inlined and add a branch for the fast path.

Related to http://bugs.gnu.org/13127
-rw-r--r--  src/cut.c  80
1 file changed, 74 insertions, 6 deletions
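
For reference, the bit-vector selection that this patch reinstates can be
sketched as a standalone program.  Only mark_printable_field and
is_printable_field mirror the logic in the diff below; the hard-coded range
spec and the main() harness are illustrative assumptions, not part of cut.c.

/* Minimal sketch of the bit-vector field selection described above.
   One bit per field/byte index; index 0 is unused because cut numbers
   fields and bytes from 1.  The range spec here is made up.  */

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static unsigned char *printable_field;

static inline void
mark_printable_field (size_t i)
{
  printable_field[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
}

static inline bool
is_printable_field (size_t i)
{
  return (printable_field[i / CHAR_BIT] >> (i % CHAR_BIT)) & 1;
}

int
main (void)
{
  size_t max_range_endpoint = 5;        /* e.g. the spec '1,3-5'.  */

  /* One bit per index 0..max_range_endpoint, zero initialized.  */
  printable_field = calloc (max_range_endpoint / CHAR_BIT + 1, 1);
  if (!printable_field)
    return EXIT_FAILURE;

  mark_printable_field (1);
  for (size_t k = 3; k <= 5; k++)
    mark_printable_field (k);

  for (size_t k = 1; k <= max_range_endpoint; k++)
    printf ("field %zu: %s\n", k,
            is_printable_field (k) ? "selected" : "skipped");

  free (printable_field);
  return EXIT_SUCCESS;
}

Built with e.g. 'cc -O2 sketch.c', this reports fields 1, 3, 4 and 5 as
selected, which is the membership test that print_kth's fast path performs
before falling back to the range-pair scan.
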
diff --git a/src/cut.c b/src/cut.c
index 37615a34a..b347b307b 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -108,11 +108,31 @@ static char *field_1_buffer;
/* The number of bytes allocated for FIELD_1_BUFFER. */
static size_t field_1_bufsize;
+/* The largest field or byte index used as an endpoint of a closed
+ or degenerate range specification; this doesn't include the starting
+ index of right-open-ended ranges. For example, with either range spec
+ '2-5,9-' or '2-3,5,9-', this variable would be set to 5. */
+static size_t max_range_endpoint;
/* If nonzero, this is the index of the first field in a range that goes
to end of line. */
static size_t eol_range_start;
+/* This is a bit vector.
+ In byte mode, which bytes to output.
+ In field mode, which DELIM-separated fields to output.
+ Both bytes and fields are numbered starting with 1,
+ so the zeroth bit of this array is unused.
+ A field or byte K has been selected if
+ (K <= MAX_RANGE_ENDPOINT && is_printable_field (K))
+ || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
+static unsigned char *printable_field;
+
+/* The maximum size of the printable_field array to allocate.
+ For ranges requiring more than this, we revert to the slightly
+ slower mechanism of inspecting the current range pair limits. */
+enum { PRINTABLE_ARRAY_MAX = 65536 };
+
enum operating_mode
{
undefined_mode,
@@ -233,14 +253,35 @@ With no FILE, or when FILE is -, read standard input.\n\
exit (status);
}
-/* Return nonzero if the K'th field or byte is printable. */
+static inline void
+mark_printable_field (size_t i)
+{
+ size_t n = i / CHAR_BIT;
+ printable_field[n] |= (1 << (i % CHAR_BIT));
+}
-static bool
+static inline bool
+is_printable_field (size_t i)
+{
+ size_t n = i / CHAR_BIT;
+ return (printable_field[n] >> (i % CHAR_BIT)) & 1;
+}
+
+/* Return nonzero if the K'th field or byte is printable.
+ Note this is a "hot" function. Please profile when changing. */
+
+static inline bool
print_kth (size_t k)
{
bool k_selected = false;
+
if (0 < eol_range_start && eol_range_start <= k)
k_selected = true;
+ else if (printable_field) /* faster path for smaller ranges. */
+ {
+ if (k <= max_range_endpoint && is_printable_field (k))
+ k_selected = true;
+ }
else if (current_rp->lo <= k && k <= current_rp->hi)
k_selected = true;
@@ -275,12 +316,14 @@ compare_ranges (const void *a, const void *b)
/* Increment *ITEM_IDX (i.e. a field or byte index),
and if required CURRENT_RP. */
-static void
+static inline void
next_item (size_t *item_idx)
{
(*item_idx)++;
- if ((*item_idx > current_rp->hi) && (current_rp < rp + n_rp - 1))
- current_rp++;
+ /* avoid extra processing associated with current_rp unless needed. */
+ if (!printable_field)
+ if ((*item_idx > current_rp->hi) && (current_rp < rp + n_rp - 1))
+ current_rp++;
}
/* Given the list of field or byte range specifications FIELDSTR,
@@ -412,6 +455,24 @@ set_fields (const char *fieldstr)
FATAL_ERROR (_("invalid byte, character or field list"));
}
+ max_range_endpoint = 0;
+ for (i = 0; i < n_rp; i++)
+ {
+ if (rp[i].hi > max_range_endpoint)
+ max_range_endpoint = rp[i].hi;
+ }
+
+ /* For performance, allocate an array large enough so that it may be
+ indexed by the field numbers corresponding to all finite ranges
+ (i.e. '2-6' or '-4', but not '5-') in FIELDSTR.
+ Note this enhancement is not possible with very large ranges,
+ or when --output-delimiter is specified. */
+
+ if (!output_delimiter_specified
+ && max_range_endpoint
+ && max_range_endpoint / CHAR_BIT < PRINTABLE_ARRAY_MAX)
+ printable_field = xzalloc (max_range_endpoint / CHAR_BIT + 1);
+
qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
/* Omit finite ranges subsumed by a to-EOL range. */
@@ -426,7 +487,8 @@ set_fields (const char *fieldstr)
}
}
- /* Merge finite range pairs (e.g. `2-5,3-4' becomes `2-5'). */
+ /* Merge finite range pairs (e.g. `2-5,3-4' becomes `2-5').
+ Also for small enough ranges, mark items as printable. */
for (i = 0; i < n_rp; ++i)
{
for (size_t j = i + 1; j < n_rp; ++j)
@@ -441,6 +503,12 @@ set_fields (const char *fieldstr)
else
break;
}
+
+ if (printable_field)
+ {
+ for (size_t k = rp[i].lo; k <= rp[i].hi; k++)
+ mark_printable_field (k);
+ }
}
/* After merging, reallocate RP so we release memory to the system.
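
As a rough illustration of the PRINTABLE_ARRAY_MAX guard added in
set_fields above: the bit vector costs max_range_endpoint / CHAR_BIT + 1
bytes, so the fast path is used only while that stays below 64 KiB
(roughly half a million field indices when CHAR_BIT is 8).  The snippet
below is only a sketch of that arithmetic; output_delimiter_specified is
hard-coded and the sample endpoints are invented, not taken from cut.c.

/* Sketch of the allocation guard in set_fields: use the bit vector only
   when it would stay below PRINTABLE_ARRAY_MAX bytes and
   --output-delimiter was not given.  Illustrative values only.  */

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

enum { PRINTABLE_ARRAY_MAX = 65536 };

int
main (void)
{
  bool output_delimiter_specified = false;  /* assume no --output-delimiter.  */
  size_t endpoints[] = { 5, 1000, 100000, 1000000 };

  for (size_t i = 0; i < sizeof endpoints / sizeof endpoints[0]; i++)
    {
      size_t max_range_endpoint = endpoints[i];
      bool use_bit_vector = (!output_delimiter_specified
                             && max_range_endpoint
                             && max_range_endpoint / CHAR_BIT < PRINTABLE_ARRAY_MAX);

      if (use_bit_vector)
        printf ("endpoint %7zu -> bit vector, %zu bytes\n",
                max_range_endpoint, max_range_endpoint / CHAR_BIT + 1);
      else
        printf ("endpoint %7zu -> fall back to range-pair scan\n",
                max_range_endpoint);
    }
  return 0;
}

With these sample endpoints, only the 1000000 case falls back to scanning
the current range pair, which matches the trade-off the commit message
describes for huge ranges.
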