Add fsm_detect_required_characters.

This inspects the DFA to determine which characters must appear in any matching input.
katef · Sep 12, 2024 · 75c051d · 75c051d
1 parent 981223f
commit 75c051d
Show file tree

Hide file tree

Showing 10 changed files with 528 additions and 0 deletions.
diff --git a/Makefile b/Makefile
@@ -116,6 +116,7 @@ SUBDIR += tests/intersect
 SUBDIR += tests/eclosure
 SUBDIR += tests/equals
 SUBDIR += tests/subtract
+SUBDIR += tests/detect_required
 SUBDIR += tests/determinise
 SUBDIR += tests/endids
 SUBDIR += tests/epsilons

diff --git a/include/fsm/walk.h b/include/fsm/walk.h
@@ -7,6 +7,8 @@
 #ifndef FSM_WALK_H
 #define FSM_WALK_H
 
+#include <adt/bitmap.h>
+
 struct fsm;
 struct fsm_state;
 
@@ -128,5 +130,34 @@ fsm_generate_matches_cb fsm_generate_cb_printf;
  * to escape all characters or just nonprintable ones. */
 fsm_generate_matches_cb fsm_generate_cb_printf_escaped;
 
+/* Walk a DFA and detect which characters MUST appear in the input for a
+ * match to be possible. For example, if input for the DFA corresponding
+ * to /^(abc|dbe)$/ does not contain 'b' at all, there's no way it can
+ * ever match, so executing the regex is unnecessary. This does not detect
+ * which characters must appear before/after others or how many times, just
+ * which must be present.
+ *
+ * The input must be a DFA. When run with EXPENSIVE_CHECKS this will
+ * check and return ERROR_MISUSE if it is not, otherwise this is an
+ * unchecked error.
+ *
+ * The bitmap will be cleared before populating. Afterward,
+ * bm_count(bitmap) will return how many required characters were
+ * found.
+ *
+ * There is an optional step_limit -- if this is reached, then it will
+ * return FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED and a
+ * cleared bitmap, because any partial information could still have been
+ * contradicted later. If the step_limit is 0 it will be ignored. */
+/* Result codes for fsm_detect_required_characters. The two error codes
+ * are negative; the two non-error codes are 0 and 1. */
+enum fsm_detect_required_characters_res {
+	/* The walk finished and the result was copied into the bitmap
+	 * (possibly an empty set). */
+	FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN,
+	/* step_limit was nonzero and that many steps were taken before
+	 * the walk finished. */
+	FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED,
+	/* The fsm has no start state, or (only when built with
+	 * EXPENSIVE_CHECKS) it is not a DFA. */
+	FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE = -1,
+	/* Memory for the walk's stack could not be allocated. */
+	FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC = -2,
+};
+enum fsm_detect_required_characters_res
+fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit,
+    struct bm *bitmap);
+
 #endif
 
diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile
@@ -7,6 +7,7 @@ SRC += src/libfsm/complete.c
 SRC += src/libfsm/consolidate.c
 SRC += src/libfsm/clone.c
 SRC += src/libfsm/closure.c
+SRC += src/libfsm/detect_required.c
 SRC += src/libfsm/edge.c
 SRC += src/libfsm/empty.c
 SRC += src/libfsm/end.c

diff --git a/src/libfsm/detect_required.c b/src/libfsm/detect_required.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2024 Scott Vokes
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+
+#include <fsm/fsm.h>
+#include <fsm/walk.h>
+#include <fsm/pred.h>
+
+#include <adt/edgeset.h>
+#include <adt/u64bitset.h>
+
+#include "internal.h"
+
+/* Compile-time verbosity for the fprintf(stderr, ...) diagnostics in this
+ * file. All default to 0 (silent); raise LOG_BASE or an individual offset
+ * to enable them. The per-step trace is only printed when LOG_STEPS > 1. */
+#define LOG_BASE 0
+#define LOG_PROGRESS (LOG_BASE + 0)
+#define LOG_STEPS (LOG_BASE + 0)
+
+/* More than one label. This is 0xffff as a uint16_t, which no single
+ * character value (0..255) can collide with. */
+#define LABEL_GROUP ((uint16_t)-1)
+
+/* Working state for a single fsm_detect_required_characters call. */
+struct dr_env {
+	const struct fsm *dfa;
+	size_t steps;		/* main-loop iterations so far; compared with step_limit */
+
+	/* Number of times a unique label has been required -- this is a count so that going
+	 * from 0 <-> 1 can set/clear the accumulator, but going from 1 -> 2 etc. does not. */
+	size_t counts[256];
+	struct bm current;	/* labels whose count is currently nonzero */
+	bool first_end_state;	/* true until the first end state has been entered */
+	struct bm overall;	/* copy of current at the first end state, intersected with current at each later one */
+
+	/* Explicit stack for the walk: frames[0 .. used-1] are live and
+	 * ceil is the number of frames allocated. */
+	struct dr_stack {
+		size_t used;
+		size_t ceil;
+		struct stack_frame {
+			fsm_state_t state;
+			uint16_t label; /* unique label followed to get here, or LABEL_GROUP */
+			struct edge_group_iter iter; /* position within this state's outgoing edge groups */
+		} *frames;
+	} stack;
+};
+
+/* Initial number of stack frames allocated; the stack doubles when full. */
+#define DEF_STACK_FRAMES 16
+
+/* Classify the symbol set of an edge group: store the character value in
+ * *label when exactly one bit of symbols[] is set, or LABEL_GROUP when more
+ * than one is. The caller must pass a set with at least one bit set. */
+static void check_symbols(const struct edge_group_iter_info *info, uint16_t *label)
+{
+	bool found = false;
+
+	for (size_t word_i = 0; word_i < 256/64; word_i++) {
+		const uint64_t word = info->symbols[word_i];
+		if (word == 0) { continue; }
+
+		/* Either an earlier word already supplied a bit, or this word
+		 * holds more than one: (word & (word - 1)) drops the lowest set
+		 * bit, so a nonzero result means another bit remains. */
+		if (found || (word & (word - 1)) != 0) {
+			*label = LABEL_GROUP;
+			return;
+		}
+
+		/* exactly one bit is set in this word; find its position */
+		size_t bit_i = 0;
+		while (((word >> bit_i) & 1) == 0) {
+			bit_i++;
+		}
+
+		*label = 64*word_i + bit_i;
+		found = true;
+	}
+
+	/* there must be at least one bit set */
+	assert(found);
+}
+
+/* Walk a DFA and attempt to detect which characters must appear in any input to match.
+ * This finds the intersection of characters required on any start->end paths (tracking
+ * edges with only one label that must be followed by all matches), so it can take
+ * prohibitively long for large/complex DFAs.
+ *
+ * The walk is an iterative depth-first search using an explicit stack. A state's
+ * visited flag is set while it has a frame on the stack and cleared when that frame
+ * is popped, so the same state can be entered again along a different path.
+ *
+ * dfa: the automaton to inspect. The per-state visited flags are used as scratch
+ *     space; the cleanup code resets all of them to false.
+ * step_limit: maximum number of main-loop iterations, or 0 for no limit.
+ * bitmap: output set. It is cleared once a start state has been found, and
+ *     overwritten with the accumulated result when the walk ends normally.
+ *
+ * Returns FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN when the bitmap holds the result,
+ * _STEP_LIMIT_REACHED when step_limit iterations were used up, _ERROR_MISUSE when
+ * there is no start state (or, with EXPENSIVE_CHECKS, the input is not a DFA), and
+ * _ERROR_ALLOC when the stack could not be allocated or grown. */
+enum fsm_detect_required_characters_res
+fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit, struct bm *bitmap)
+{
+	assert(dfa != NULL);
+	assert(bitmap != NULL);
+
+	#if EXPENSIVE_CHECKS
+	if (!fsm_all(dfa, fsm_isdfa)) {
+		return FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE;
+	}
+	#endif
+
+	/* Default result, so allocation failures can jump straight to cleanup. */
+	enum fsm_detect_required_characters_res res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+
+	/* Members not named here (counts, bitmaps, stack) start out zeroed. */
+	struct dr_env env = {
+		.dfa = dfa,
+		.first_end_state = true,
+	};
+
+	assert(env.counts[0] == 0);
+
+	const size_t state_count = fsm_countstates(dfa);
+	fsm_state_t start_state;
+	if (!fsm_getstart(dfa, &start_state)) {
+		res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE;
+		goto cleanup;
+	}
+
+	#if EXPENSIVE_CHECKS
+	for (fsm_state_t s = 0; s < state_count; s++) {
+		assert(!dfa->states[s].visited);
+	}
+	#endif
+
+	bm_clear(bitmap);
+
+	/* If the start state is also an end state, then
+	 * it matches the empty string, so we're done. */
+	if (fsm_isend(dfa, start_state)) {
+		res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
+		goto cleanup;
+	}
+
+	env.stack.frames = f_malloc(dfa->alloc, DEF_STACK_FRAMES * sizeof(env.stack.frames[0]));
+	if (env.stack.frames == NULL) { goto cleanup; }
+	env.stack.ceil = DEF_STACK_FRAMES;
+
+	{			/* set up start state's stack frame */
+		struct stack_frame *sf0 = &env.stack.frames[0];
+		sf0->state = start_state;
+		sf0->label = LABEL_GROUP;
+
+		dfa->states[start_state].visited = true;
+
+		edge_set_group_iter_reset(dfa->states[start_state].edges,
+		    EDGE_GROUP_ITER_ALL, &sf0->iter);
+		env.stack.used = 1;
+	}
+
+	while (env.stack.used > 0) {
+		struct stack_frame *sf = &env.stack.frames[env.stack.used - 1];
+		struct edge_group_iter_info info;
+
+		/* steps is incremented before the comparison, so it is at least 1
+		 * here and a step_limit of 0 can never be equal to it. */
+		env.steps++;
+		if (LOG_STEPS > 1) {
+			fprintf(stderr, "-- steps %zu/%zu\n", env.steps, step_limit);
+		}
+		if (env.steps == step_limit) {
+			res = FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED;
+			goto cleanup;
+		}
+
+		if (edge_set_group_iter_next(&sf->iter, &info)) {
+			assert(info.to < state_count);
+			if (dfa->states[info.to].visited) {
+				continue; /* skip visited state */
+			}
+
+			if (env.stack.used == env.stack.ceil) { /* grow stack */
+				const size_t nceil = 2*env.stack.ceil;
+				assert(nceil > env.stack.ceil);
+				struct stack_frame *nframes = f_realloc(dfa->alloc,
+				    env.stack.frames, nceil * sizeof(nframes[0]));
+				if (nframes == NULL) {
+					/* Leave through cleanup rather than returning
+					 * directly, so the frames are freed and the
+					 * visited flags set during the walk are reset.
+					 * NOTE(review): assumes f_realloc, like realloc,
+					 * leaves the old buffer valid on failure. */
+					res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+					goto cleanup;
+				}
+
+				/* sf is not used again in this iteration, so it does not
+				 * matter that it may now point into the old buffer. */
+				env.stack.frames = nframes;
+				env.stack.ceil = nceil;
+			}
+
+			/* enter state */
+			dfa->states[info.to].visited = true;
+
+			struct stack_frame *nsf = &env.stack.frames[env.stack.used];
+			nsf->state = info.to;
+			check_symbols(&info, &nsf->label);
+
+			/* An edge group with a single label makes that character
+			 * required for as long as this frame stays on the stack. */
+			if (nsf->label != LABEL_GROUP) {
+				size_t offset = (nsf->label & 0xff);
+				size_t count = ++env.counts[offset];
+				if (count == 1) {
+					bm_set(&env.current, offset);
+				}
+			}
+
+			edge_set_group_iter_reset(dfa->states[info.to].edges,
+			    EDGE_GROUP_ITER_ALL, &nsf->iter);
+			env.stack.used++;
+
+			if (fsm_isend(dfa, info.to)) {
+				if (env.first_end_state) {
+					bm_copy(&env.overall, &env.current);
+					env.first_end_state = false;
+				} else { /* intersect */
+					bm_intersect(&env.overall, &env.current);
+				}
+
+				if (LOG_PROGRESS) {
+					fprintf(stderr, "-- current: ");
+					bm_print(stderr, NULL, &env.current, 0, fsm_escputc);
+					fprintf(stderr, ", overall: ");
+					bm_print(stderr, NULL, &env.overall, 0, fsm_escputc);
+					fprintf(stderr, "\n");
+				}
+
+				/* Intersecting with the empty set will always be empty, so
+				 * further exploration is unnecessary. */
+				if (!bm_any(&env.overall)) {
+					res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
+					break;
+				}
+			}
+
+		} else {	/* done with state */
+			/* If this state was reached via a unique label, then
+			 * reduce the count. If the count returns to 0, remove
+			 * it from the constraint set. */
+			if (sf->label != LABEL_GROUP) {
+				size_t offset = (sf->label & 0xff);
+				size_t count = --env.counts[offset];
+				if (count == 0) {
+					bm_unset(&env.current, offset);
+				}
+			}
+
+			/* clear visited */
+			dfa->states[sf->state].visited = false;
+
+			env.stack.used--;
+		}
+	}
+
+	if (LOG_STEPS) {
+		fprintf(stderr, "%s: finished in %zu/%zu steps\n", __func__, env.steps, step_limit);
+	}
+
+	res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
+	bm_copy(bitmap, &env.overall);
+
+cleanup:
+	f_free(dfa->alloc, env.stack.frames);
+
+	/* States still on the stack after an early exit are still marked, so
+	 * reset every flag rather than only those popped normally. */
+	for (fsm_state_t s = 0; s < state_count; s++) {
+		dfa->states[s].visited = false;
+	}
+
+	return res;
+}
diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms
@@ -16,6 +16,7 @@ fsm_reachableall
 fsm_reachableany
 fsm_walk_edges
 fsm_walk_states
+fsm_detect_required_characters
 
 # <fsm/pred.h>
 fsm_epsilonsonly

diff --git a/tests/detect_required/Makefile b/tests/detect_required/Makefile
@@ -0,0 +1,26 @@
+.include "../../share/mk/top.mk"
+
+TEST.tests/detect_required != ls -1 tests/detect_required/detect_required*.c
+TEST_SRCDIR.tests/detect_required = tests/detect_required
+TEST_OUTDIR.tests/detect_required = ${BUILD}/tests/detect_required
+
+.for n in ${TEST.tests/detect_required:T:R:C/^detect_required//}
+test:: ${TEST_OUTDIR.tests/detect_required}/res${n}
+SRC += ${TEST_SRCDIR.tests/detect_required}/detect_required${n}.c
+CFLAGS.${TEST_SRCDIR.tests/detect_required}/detect_required${n}.c = -UNDEBUG
+
+${TEST_OUTDIR.tests/detect_required}/run${n}: ${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o ${TEST_OUTDIR.tests/detect_required}/testutil.o
+	${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/detect_required}/run${n} ${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o ${TEST_OUTDIR.tests/detect_required}/testutil.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a
+
+${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o: tests/detect_required/testutil.h
+
+${TEST_OUTDIR.tests/detect_required}/res${n}: ${TEST_OUTDIR.tests/detect_required}/run${n}
+	( ${TEST_OUTDIR.tests/detect_required}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/detect_required}/res${n}
+
+.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
+${TEST_OUTDIR.tests/detect_required}/run${n}: ${BUILD}/lib/${lib:R}.a
+.endfor
+.endfor
+
+${TEST_OUTDIR.tests/detect_required}/testutil.o: tests/detect_required/testutil.c
+	${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/detect_required}/testutil.o tests/detect_required/testutil.c
diff --git a/tests/detect_required/detect_required1.c b/tests/detect_required/detect_required1.c
@@ -0,0 +1,32 @@
+#include "testutil.h"
+
+/* Table of cases passed one at a time to run_test. Each pairs a regex with
+ * a string of characters in .required -- presumably the exact set that
+ * fsm_detect_required_characters should report for that regex; confirm
+ * against run_test in testutil.c. Read that way, the cases cover: an empty
+ * match, plain literals, alternations with and without a shared character,
+ * and an optional versus a mandatory repeated group. */
+const struct testcase tests[] = {
+	{ .regex = "^$", .required = "" },
+	{ .regex = "^a$", .required = "a" },
+	{ .regex = "^abcde$", .required = "abcde" },
+	{ .regex = "^(ab|cd)$", .required = "" },
+	{ .regex = "^(ab|cd|ef)$", .required = "" },
+	{ .regex = "^(abc|def)$", .required = "" },
+	{ .regex = "^(abc|dbf)$", .required = "b" },
+	{ .regex = "^abc(def)*ghi$", .required = "abcghi" },
+	{ .regex = "^abc(def)+ghi$", .required = "abcdefghi" },
+	{ .regex = "^ghi(def)abc$", .required = "abcdefghi" },
+};
+
+/* Run every entry of tests[] through run_test and exit with EXIT_SUCCESS
+ * only if none failed. When FIRST_FAIL is set in the environment, stop at
+ * the first failing case instead of running the rest. */
+int main()
+{
+	const size_t testcount = sizeof(tests)/sizeof(tests[0]);
+	const bool first_fail = (getenv("FIRST_FAIL") != NULL);
+	size_t failures = 0;
+	size_t i = 0;
+
+	while (i < testcount) {
+		const bool passed = run_test(&tests[i]);
+		i++;
+
+		if (passed) { continue; }
+
+		failures++;
+		if (first_fail) { break; }
+	}
+
+	if (failures > 0) {
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}