diff options
author | Holger Weiss <holger@zedat.fu-berlin.de> | 2013-08-18 11:03:36 +0200 |
---|---|---|
committer | Holger Weiss <holger@zedat.fu-berlin.de> | 2013-08-18 11:03:36 +0200 |
commit | f3dbc2ec871da22028969540424a63ff51404cfd (patch) | |
tree | 4225e3007008839ec710fcc17ed6cc9bf8db748c /gl/str-two-way.h | |
parent | 5c8dd483ccc5f4cf2d86c4ae386efb86e3b259cd (diff) | |
download | monitoring-plugins-f3dbc2ec871da22028969540424a63ff51404cfd.tar.gz |
Sync with the latest Gnulib code (6f2d632)
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r-- | gl/str-two-way.h | 59 |
1 files changed, 40 insertions, 19 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index c08f60ef..4d555f92 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h @@ -95,26 +95,37 @@ A critical factorization has the property that the local period equals the global period. All strings have at least one critical factorization with the left half smaller than the global period. + And while some strings have more than one critical factorization, + it is provable that with an ordered alphabet, at least one of the + critical factorizations corresponds to a maximal suffix. Given an ordered alphabet, a critical factorization can be computed in linear time, with 2 * NEEDLE_LEN comparisons, by computing the - larger of two ordered maximal suffixes. The ordered maximal - suffixes are determined by lexicographic comparison of + shorter of two ordered maximal suffixes. The ordered maximal + suffixes are determined by lexicographic comparison while tracking periodicity. */ static size_t critical_factorization (const unsigned char *needle, size_t needle_len, size_t *period) { - /* Index of last byte of left half, or SIZE_MAX. */ + /* Index of last byte of left half. */ size_t max_suffix, max_suffix_rev; size_t j; /* Index into NEEDLE for current candidate suffix. */ size_t k; /* Offset into current period. */ size_t p; /* Intermediate period. */ unsigned char a, b; /* Current comparison bytes. */ + /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered + out 0-length needles. */ + if (needle_len < 3) + { + *period = 1; + return needle_len - 1; + } + /* Invariants: - 0 <= j < NEEDLE_LEN - 1 - -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) + 1 <= j < NEEDLE_LEN - 1 + 0 <= max_suffix{,_rev} < j min(max_suffix, max_suffix_rev) < global period of NEEDLE 1 <= p <= global period of NEEDLE p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] @@ -122,9 +133,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, */ /* Perform lexicographic search. */ - max_suffix = SIZE_MAX; - j = 0; - k = p = 1; + max_suffix = 0; + j = k = p = 1; while (j + k < needle_len) { a = CANON_ELEMENT (needle[j + k]); @@ -157,9 +167,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, *period = p; /* Perform reverse lexicographic search. */ - max_suffix_rev = SIZE_MAX; - j = 0; - k = p = 1; + max_suffix_rev = 0; + j = k = p = 1; while (j + k < needle_len) { a = CANON_ELEMENT (needle[j + k]); @@ -190,8 +199,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, } } - /* Choose the longer suffix. Return the first byte of the right - half, rather than the last byte of the left half. */ + /* Choose the shorter suffix. Return the index of the first byte of + the right half, rather than the last byte of the left half. + + For some examples, 'banana' has two critical factorizations, both + exposed by the two lexicographic extreme suffixes of 'anana' and + 'nana', where both suffixes have a period of 2. On the other + hand, with 'aab' and 'bba', both strings have a single critical + factorization of the last byte, with the suffix having a period + of 1. While the maximal lexicographic suffix of 'aab' is 'b', + the maximal lexicographic suffix of 'bba' is 'ba', which is not a + critical factorization. Conversely, the maximal reverse + lexicographic suffix of 'a' works for 'bba', but not 'ab' for + 'aab'. The shorter suffix of the two will always be a critical + factorization. */ if (max_suffix_rev + 1 < max_suffix + 1) return max_suffix + 1; *period = p; @@ -226,9 +247,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; j = 0; while (AVAILABLE (haystack, haystack_len, j, needle_len)) @@ -330,9 +351,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, first. */ if (CMP_FUNC (needle, needle + period, suffix) == 0) { - /* Entire needle is periodic; a mismatch can only advance by the - period, so use memory to avoid rescanning known occurrences - of the period. */ + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ size_t memory = 0; size_t shift; j = 0; |