Skip to content

Commit

Permalink
Add fsm_detect_required_characters.
Browse files Browse the repository at this point in the history
This inspects the DFA to determine which characters must appear in
any matching input.
  • Loading branch information
silentbicycle committed Sep 12, 2024
1 parent 981223f commit 75c051d
Show file tree
Hide file tree
Showing 10 changed files with 528 additions and 0 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ SUBDIR += tests/intersect
SUBDIR += tests/eclosure
SUBDIR += tests/equals
SUBDIR += tests/subtract
SUBDIR += tests/detect_required
SUBDIR += tests/determinise
SUBDIR += tests/endids
SUBDIR += tests/epsilons
Expand Down
31 changes: 31 additions & 0 deletions include/fsm/walk.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#ifndef FSM_WALK_H
#define FSM_WALK_H

#include <adt/bitmap.h>

struct fsm;
struct fsm_state;

Expand Down Expand Up @@ -128,5 +130,34 @@ fsm_generate_matches_cb fsm_generate_cb_printf;
* to escape all characters or just nonprintable ones. */
fsm_generate_matches_cb fsm_generate_cb_printf_escaped;

/* Walk a DFA and detect which characters MUST appear in the input for a
* match to be possible. For example, if input for the DFA corresponding
* to /^(abc|dbe)$/ does not contain 'b' at all, there's no way it can
* ever match, so executing the regex is unnecessary. This does not detect
* which characters must appear before/after others or how many times, just
* which must be present.
*
* The input must be a DFA. When run with EXPENSIVE_CHECKS this will
* check and return ERROR_MISUSE if it is not, otherwise this is an
* unchecked error.
*
* The bitmap will be cleared before populating. Afterward,
* bm_count(bitmap) will return how many required characters were
* found.
*
* There is an optional step_limit -- if this is reached, then it will
* return FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED and a
* cleared bitmap, because any partial information could still have been
* contradicted later. If the step_limit is 0 it will be ignored. */
/* Result codes for fsm_detect_required_characters().
 * Non-negative values are normal outcomes; negative values are errors. */
enum fsm_detect_required_characters_res {
/* The walk finished and the bitmap holds the required characters
 * (possibly none). */
FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN,
/* The non-zero step_limit was reached; the bitmap is left cleared. */
FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED,
/* The FSM has no start state, or (only when built with
 * EXPENSIVE_CHECKS) it is not a DFA. */
FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE = -1,
/* Allocating or growing the internal traversal stack failed. */
FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC = -2,
};
/* dfa: the DFA to inspect. step_limit: maximum number of traversal
 * steps, or 0 for no limit. bitmap: output set of required characters. */
enum fsm_detect_required_characters_res
fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit,
struct bm *bitmap);

#endif

1 change: 1 addition & 0 deletions src/libfsm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ SRC += src/libfsm/complete.c
SRC += src/libfsm/consolidate.c
SRC += src/libfsm/clone.c
SRC += src/libfsm/closure.c
SRC += src/libfsm/detect_required.c
SRC += src/libfsm/edge.c
SRC += src/libfsm/empty.c
SRC += src/libfsm/end.c
Expand Down
264 changes: 264 additions & 0 deletions src/libfsm/detect_required.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
/*
* Copyright 2024 Scott Vokes
*
* See LICENCE for the full copyright terms.
*/

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#include <fsm/fsm.h>
#include <fsm/walk.h>
#include <fsm/pred.h>

#include <adt/edgeset.h>
#include <adt/u64bitset.h>

#include "internal.h"

/* Compile-time debug logging levels; all output goes to stderr.
 * Raise LOG_BASE to enable everything, or adjust the individual offsets. */
#define LOG_BASE 0
/* Non-zero: print the current and overall bitmaps whenever an end state is reached. */
#define LOG_PROGRESS (LOG_BASE + 0)
/* Non-zero: print the final step count; greater than 1: also print every step. */
#define LOG_STEPS (LOG_BASE + 0)

/* More than one label: sentinel stored in a stack frame's label when the
 * edge group followed carries several symbols. As a uint16_t it is 0xFFFF,
 * which cannot collide with a single character value (0..255). */
#define LABEL_GROUP ((uint16_t)-1)

/* Working state for one fsm_detect_required_characters() walk. */
struct dr_env {
const struct fsm *dfa;
/* Number of iterations of the main traversal loop so far; compared against step_limit. */
size_t steps;

/* Number of times a unique label has been required -- this is a count so that going
* from 0 <-> 1 can set/clear the accumulator, but going from 1 -> 2 etc. does not. */
size_t counts[256];
/* Characters with a non-zero count, i.e. required along the path currently on the stack. */
struct bm current;
/* True until the first end state is reached: that one copies `current`
 * into `overall`, later ones intersect with it. */
bool first_end_state;
/* Intersection of `current` over every end state reached so far. */
struct bm overall;

/* Explicit depth-first traversal stack; the top frame is frames[used - 1]. */
struct dr_stack {
size_t used; /* frames currently in use */
size_t ceil; /* frames allocated */
struct stack_frame {
fsm_state_t state; /* state this frame is exploring */
uint16_t label; /* unique label followed to get here, or LABEL_GROUP */
struct edge_group_iter iter; /* position within this state's outgoing edge groups */
} *frames;
} stack;
};

/* Initial number of frames allocated for the traversal stack; the stack
 * is doubled whenever it fills up. */
#define DEF_STACK_FRAMES 16

/* Classify the symbol set of an edge group.
 *
 * info->symbols[] is read as a 256-bit set stored in four 64-bit words.
 * If exactly one bit is set, *label receives that bit's index (the
 * character value). If more than one bit is set, *label receives
 * LABEL_GROUP. At least one bit must be set; this is asserted. */
static void check_symbols(const struct edge_group_iter_info *info, uint16_t *label)
{
	bool any = false;

	for (size_t word_i = 0; word_i < 256/64; word_i++) {
		uint64_t word = info->symbols[word_i];
		if (word == 0) {
			continue;
		}

		/* A second non-empty word, or a word with more than one bit
		 * set (clearing the lowest set bit leaves something behind),
		 * means the group has several labels. */
		if (any || (word & (word - 1)) != 0) {
			*label = LABEL_GROUP;
			return;
		}

		/* exactly one bit is set in this word: find its position */
		size_t bit_i = 0;
		while ((word & 1) == 0) {
			word >>= 1;
			bit_i++;
		}

		*label = 64*word_i + bit_i;
		any = true;
	}

	/* there must be at least one bit set */
	assert(any);
}

/* Walk a DFA and attempt to detect which characters must appear in any input to match.
 * This finds the intersection of characters required on any start->end paths (tracking
 * edges with only one label that must be followed by all matches), so it can take
 * prohibitively long for large/complex DFAs.
 *
 * dfa:        the DFA to inspect. Its per-state `visited` flags are used as
 *             scratch space and are all reset to false before returning via
 *             the cleanup path.
 * step_limit: maximum number of traversal steps; 0 disables the limit.
 * bitmap:     output; cleared before the walk and, on success, set to the
 *             required characters.
 *
 * Returns:
 *  - FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN when the walk completed (or
 *    ended early because the required set became empty);
 *  - FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED when step_limit was
 *    hit, leaving the bitmap cleared;
 *  - FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE when there is no start
 *    state (or, with EXPENSIVE_CHECKS, the input is not a DFA);
 *  - FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC when the traversal stack
 *    could not be allocated or grown. */
enum fsm_detect_required_characters_res
fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit, struct bm *bitmap)
{
	assert(dfa != NULL);
	assert(bitmap != NULL);

#if EXPENSIVE_CHECKS
	if (!fsm_all(dfa, fsm_isdfa)) {
		return FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE;
	}
#endif

	enum fsm_detect_required_characters_res res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;

	/* everything not named here (counts, bitmaps, stack) is zero-initialized */
	struct dr_env env = {
		.dfa = dfa,
		.first_end_state = true,
	};

	assert(env.counts[0] == 0);

	const size_t state_count = fsm_countstates(dfa);
	fsm_state_t start_state;
	if (!fsm_getstart(dfa, &start_state)) {
		res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE;
		goto cleanup;
	}

#if EXPENSIVE_CHECKS
	for (fsm_state_t s = 0; s < state_count; s++) {
		assert(!dfa->states[s].visited);
	}
#endif

	bm_clear(bitmap);

	/* If the start state is also an end state, then
	 * it matches the empty string, so we're done. */
	if (fsm_isend(dfa, start_state)) {
		res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
		goto cleanup;
	}

	env.stack.frames = f_malloc(dfa->alloc, DEF_STACK_FRAMES * sizeof(env.stack.frames[0]));
	if (env.stack.frames == NULL) { goto cleanup; }
	env.stack.ceil = DEF_STACK_FRAMES;

	{			/* set up start state's stack frame */
		struct stack_frame *sf0 = &env.stack.frames[0];
		sf0->state = start_state;
		sf0->label = LABEL_GROUP;

		dfa->states[start_state].visited = true;

		edge_set_group_iter_reset(dfa->states[start_state].edges,
		    EDGE_GROUP_ITER_ALL, &sf0->iter);
		env.stack.used = 1;
	}

	/* Depth-first walk with an explicit stack. A state is marked visited
	 * while it is on the stack and unmarked when popped, so the same state
	 * can be re-entered later along a different path. */
	while (env.stack.used > 0) {
		struct stack_frame *sf = &env.stack.frames[env.stack.used - 1];
		struct edge_group_iter_info info;
		env.steps++;
		if (LOG_STEPS > 1) {
			fprintf(stderr, "-- steps %zu/%zu\n", env.steps, step_limit);
		}
		/* steps is at least 1 here, so a step_limit of 0 never matches */
		if (env.steps == step_limit) {
			/* bitmap was cleared above and has not been written since */
			res = FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED;
			goto cleanup;
		}

		if (edge_set_group_iter_next(&sf->iter, &info)) {
			assert(info.to < state_count);
			if (dfa->states[info.to].visited) {
				continue; /* skip visited state */
			}

			if (env.stack.used == env.stack.ceil) { /* grow stack */
				const size_t nceil = 2*env.stack.ceil;
				assert(nceil > env.stack.ceil);
				struct stack_frame *nframes = f_realloc(dfa->alloc,
				    env.stack.frames, nceil * sizeof(nframes[0]));
				if (nframes == NULL) {
					/* env.stack.frames still points at the old
					 * allocation; leave through cleanup so it is
					 * freed and the visited flags are reset. */
					res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
					goto cleanup;
				}

				env.stack.frames = nframes;
				env.stack.ceil = nceil;
			}

			/* enter state */
			dfa->states[info.to].visited = true;

			struct stack_frame *nsf = &env.stack.frames[env.stack.used];
			nsf->state = info.to;
			check_symbols(&info, &nsf->label);

			/* An edge group with a single label makes that character
			 * required along the current path. */
			if (nsf->label != LABEL_GROUP) {
				size_t offset = (nsf->label & 0xff);
				size_t count = ++env.counts[offset];
				if (count == 1) {
					bm_set(&env.current, offset);
				}
			}

			edge_set_group_iter_reset(dfa->states[info.to].edges,
			    EDGE_GROUP_ITER_ALL, &nsf->iter);
			env.stack.used++;

			if (fsm_isend(dfa, info.to)) {
				if (env.first_end_state) {
					bm_copy(&env.overall, &env.current);
					env.first_end_state = false;
				} else { /* intersect */
					bm_intersect(&env.overall, &env.current);
				}

				if (LOG_PROGRESS) {
					fprintf(stderr, "-- current: ");
					bm_print(stderr, NULL, &env.current, 0, fsm_escputc);
					fprintf(stderr, ", overall: ");
					bm_print(stderr, NULL, &env.overall, 0, fsm_escputc);
					fprintf(stderr, "\n");
				}

				/* Intersecting with the empty set will always be empty, so
				 * further exploration is unnecessary. */
				if (!bm_any(&env.overall)) {
					res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
					break;
				}
			}

		} else {	/* done with state */
			/* If this state was reached via a unique label, then
			 * reduce the count. If the count returns to 0, remove
			 * it from the constraint set. */
			if (sf->label != LABEL_GROUP) {
				size_t offset = (sf->label & 0xff);
				size_t count = --env.counts[offset];
				if (count == 0) {
					bm_unset(&env.current, offset);
				}
			}

			/* clear visited */
			dfa->states[sf->state].visited = false;

			env.stack.used--;
		}
	}

	if (LOG_STEPS) {
		fprintf(stderr, "%s: finished in %zu/%zu steps\n", __func__, env.steps, step_limit);
	}

	res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
	bm_copy(bitmap, &env.overall);

cleanup:
	f_free(dfa->alloc, env.stack.frames);

	/* An early exit can leave states on the stack still marked visited,
	 * so reset every state rather than only the popped ones. */
	for (fsm_state_t s = 0; s < state_count; s++) {
		dfa->states[s].visited = false;
	}

	return res;
}
1 change: 1 addition & 0 deletions src/libfsm/libfsm.syms
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ fsm_reachableall
fsm_reachableany
fsm_walk_edges
fsm_walk_states
fsm_detect_required_characters

# <fsm/pred.h>
fsm_epsilonsonly
Expand Down
26 changes: 26 additions & 0 deletions tests/detect_required/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
.include "../../share/mk/top.mk"

TEST.tests/detect_required != ls -1 tests/detect_required/detect_required*.c
TEST_SRCDIR.tests/detect_required = tests/detect_required
TEST_OUTDIR.tests/detect_required = ${BUILD}/tests/detect_required

.for n in ${TEST.tests/detect_required:T:R:C/^detect_required//}
test:: ${TEST_OUTDIR.tests/detect_required}/res${n}
SRC += ${TEST_SRCDIR.tests/detect_required}/detect_required${n}.c
CFLAGS.${TEST_SRCDIR.tests/detect_required}/detect_required${n}.c = -UNDEBUG

${TEST_OUTDIR.tests/detect_required}/run${n}: ${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o ${TEST_OUTDIR.tests/detect_required}/testutil.o
${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/detect_required}/run${n} ${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o ${TEST_OUTDIR.tests/detect_required}/testutil.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a

${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o: tests/detect_required/testutil.h

${TEST_OUTDIR.tests/detect_required}/res${n}: ${TEST_OUTDIR.tests/detect_required}/run${n}
( ${TEST_OUTDIR.tests/detect_required}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/detect_required}/res${n}

.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
${TEST_OUTDIR.tests/detect_required}/run${n}: ${BUILD}/lib/${lib:R}.a
.endfor
.endfor

${TEST_OUTDIR.tests/detect_required}/testutil.o: tests/detect_required/testutil.c
${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/detect_required}/testutil.o tests/detect_required/testutil.c
32 changes: 32 additions & 0 deletions tests/detect_required/detect_required1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "testutil.h"

/* Each case pairs a regex with the set of characters expected to be
 * reported as required, written as a string of those characters. */
const struct testcase tests[] = {
/* matches only the empty string, so nothing is required */
{ .regex = "^$", .required = "" },
{ .regex = "^a$", .required = "a" },
{ .regex = "^abcde$", .required = "abcde" },
/* alternatives with no character in common */
{ .regex = "^(ab|cd)$", .required = "" },
{ .regex = "^(ab|cd|ef)$", .required = "" },
{ .regex = "^(abc|def)$", .required = "" },
/* 'b' appears in every alternative */
{ .regex = "^(abc|dbf)$", .required = "b" },
/* (def)* may repeat zero times, so d, e, f are optional */
{ .regex = "^abc(def)*ghi$", .required = "abcghi" },
/* (def)+ must appear at least once */
{ .regex = "^abc(def)+ghi$", .required = "abcdefghi" },
/* the required string lists the same characters regardless of their order in the regex */
{ .regex = "^ghi(def)abc$", .required = "abcdefghi" },
};

/* Run every entry in tests[] and exit with EXIT_SUCCESS only if all of
 * them pass. When the FIRST_FAIL environment variable is set (to any
 * value), stop at the first failing case instead of running the rest. */
int main()
{
	const size_t total = sizeof(tests)/sizeof(tests[0]);
	const bool stop_on_failure = getenv("FIRST_FAIL") != NULL;

	size_t failed = 0;
	size_t idx = 0;
	while (idx < total) {
		const bool passed = run_test(&tests[idx]);
		idx++;
		if (passed) {
			continue;
		}

		failed++;
		if (stop_on_failure) {
			break;
		}
	}

	if (failed != 0) {
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
Loading

0 comments on commit 75c051d

Please sign in to comment.