From 81a5b6485f85a86c19b274792088be98ff381de8 Mon Sep 17 00:00:00 2001 From: Eloy Coto Date: Tue, 14 Nov 2023 20:47:59 +0100 Subject: [PATCH 1/3] refactor: move bsearch function to C-code This commit fixes issue #527 and move the bsearch function to a native C-code. The performance is a bit better: Testing script: ```bash clear if [[ `uname` == Darwin ]]; then MAX_MEMORY_UNITS=KB else MAX_MEMORY_UNITS=MB fi export TIMEFMT='%J %U user %S system %P cpu %*E total'$'\n'\ 'avg shared (code): %X KB'$'\n'\ 'avg unshared (data/stack): %D KB'$'\n'\ 'total (sum): %K KB'$'\n'\ 'max memory: %M '$MAX_MEMORY_UNITS''$'\n'\ 'page faults from disk: %F'$'\n'\ 'other page faults: %R' echo "JQ code bsearch" time /usr/bin/jq -n '[range(30000000)] | bsearch(3000)' echo "C code bsearch" time ./jq -n '[range(30000000)] | bsearch(3000)' ```` Results: ``` JQ code bsearch 3000 /usr/bin/jq -n '[range(30000000)] | bsearch(3000)' 8.63s user 0.77s system 98% cpu 9.542 total avg shared (code): 0 KB avg unshared (data/stack): 0 KB total (sum): 0 KB max memory: 823 MB page faults from disk: 1 other page faults: 432828 C code bsearch 3000 ./jq -n '[range(30000000)] | bsearch(3000)' 8.44s user 0.74s system 99% cpu 9.249 total avg shared (code): 0 KB avg unshared (data/stack): 0 KB total (sum): 0 KB max memory: 824 MB page faults from disk: 0 other page faults: 432766 ``` The results may be better if we can use jvp_array_read, and there is no need to copy/free the input array in each iteration. I guess that is like that for API pourposes when the libjq is in use with multiple threads in place. Signed-off-by: Eloy Coto --- src/builtin.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++- src/builtin.jq | 31 --------------------------- tests/jq.test | 4 ++++ 3 files changed, 60 insertions(+), 32 deletions(-) diff --git a/src/builtin.c b/src/builtin.c index 9aebd1f2d2..61acaaea1c 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -46,7 +46,6 @@ void *alloca (size_t); #include "jv_private.h" #include "util.h" - #define BINOP(name) \ static jv f_ ## name(jq_state *jq, jv input, jv a, jv b) { \ jv_free(input); \ @@ -780,6 +779,61 @@ static jv f_sort_by_impl(jq_state *jq, jv input, jv keys) { } } +/* Assuming the input array is sorted, bsearch/1 returns */ +/* the index of the target if the target is in the input array; and otherwise */ +/* (-1 - ix), where ix is the insertion point that would leave the array sorted. */ +/* If the input is not sorted, bsearch will terminate but with irrelevant results. */ +static jv f_bsearch(jq_state *jq, jv input, jv target) { + assert(jv_get_kind(input) == JV_KIND_ARRAY); + int len = jv_array_length(jv_copy(input)); + if (len == 0) { + jv_free(input); + jv_free(target); + return jv_number(-1); + } else if (len == 1) { + int result = jv_cmp(target, jv_array_get(input, 0)); + if (result == 0 ) { + return jv_number(0); + } else if (result > 0) { + return jv_number(-2); + } else { + return jv_number(-1); + } + } + + int start = 0; + int end = len - 1; + jv answer = jv_null(); + while (start .[1] ; - if .[2] != null then (.[1] = -1) # i.e. break - else - ( ( (.[1] + .[0]) / 2 ) | floor ) as $mid - | $in[$mid] as $monkey - | if $monkey == $target then (.[2] = $mid) # success - elif .[0] == .[1] then (.[1] = -1) # failure - elif $monkey < $target then (.[0] = ($mid + 1)) - else (.[1] = ($mid - 1)) - end - end ) - | if .[2] == null then # compute the insertion point - if $in[ .[0] ] < $target then (-2 -.[0]) - else (-1 -.[0]) - end - else .[2] - end - end; - # Apply f to composite entities recursively, and to atoms def walk(f): def w: diff --git a/tests/jq.test b/tests/jq.test index b94f29d245..18e1fdb401 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1553,6 +1553,10 @@ bsearch(0,2,4) 1 -4 +bsearch({x:1}) +[{ "x": 0 },{ "x": 1 },{ "x": 2 }] +1 + # strptime tests are in optional.test strftime("%Y-%m-%dT%H:%M:%SZ") From 097fc631e3bfc7f1d851f6b8e80bc4cf8a59e46c Mon Sep 17 00:00:00 2001 From: Eloy Coto Date: Mon, 5 Feb 2024 10:47:20 +0100 Subject: [PATCH 2/3] style: fixes from suggestions. Co-authored-by: itchyny Signed-off-by: Eloy Coto --- src/builtin.c | 68 +++++++++++++++++---------------------------------- tests/jq.test | 4 ++- 2 files changed, 25 insertions(+), 47 deletions(-) diff --git a/src/builtin.c b/src/builtin.c index 61acaaea1c..b354679455 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -784,54 +784,30 @@ static jv f_sort_by_impl(jq_state *jq, jv input, jv keys) { /* (-1 - ix), where ix is the insertion point that would leave the array sorted. */ /* If the input is not sorted, bsearch will terminate but with irrelevant results. */ static jv f_bsearch(jq_state *jq, jv input, jv target) { - assert(jv_get_kind(input) == JV_KIND_ARRAY); - int len = jv_array_length(jv_copy(input)); - if (len == 0) { - jv_free(input); - jv_free(target); - return jv_number(-1); - } else if (len == 1) { - int result = jv_cmp(target, jv_array_get(input, 0)); - if (result == 0 ) { - return jv_number(0); - } else if (result > 0) { - return jv_number(-2); - } else { - return jv_number(-1); - } - } - - int start = 0; - int end = len - 1; - jv answer = jv_null(); - while (start Date: Thu, 29 Feb 2024 09:44:57 +0100 Subject: [PATCH 3/3] fix: bsearch prevent overflow on mid calculation Signed-off-by: Eloy Coto --- src/builtin.c | 4 +++- tests/jq.test | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/builtin.c b/src/builtin.c index b354679455..204d3f6feb 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -46,6 +46,7 @@ void *alloca (size_t); #include "jv_private.h" #include "util.h" + #define BINOP(name) \ static jv f_ ## name(jq_state *jq, jv input, jv a, jv b) { \ jv_free(input); \ @@ -785,13 +786,14 @@ static jv f_sort_by_impl(jq_state *jq, jv input, jv keys) { /* If the input is not sorted, bsearch will terminate but with irrelevant results. */ static jv f_bsearch(jq_state *jq, jv input, jv target) { if (jv_get_kind(input) != JV_KIND_ARRAY) { + jv_free(target); return type_error(input, "cannot be searched from"); } int start = 0; int end = jv_array_length(jv_copy(input)); jv answer = jv_invalid(); while (start < end) { - int mid = (start + end) / 2; + int mid = start + (end - start) / 2; int result = jv_cmp(jv_copy(target), jv_array_get(jv_copy(input), mid)); if (result == 0) { answer = jv_number(mid); diff --git a/tests/jq.test b/tests/jq.test index dd8d5b94b6..8991e27d3a 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1559,6 +1559,10 @@ bsearch({x:1}) [{ "x": 0 },{ "x": 1 },{ "x": 2 }] 1 +try ["OK", bsearch(0)] catch ["KO",.] +"aa" +["KO","string (\"aa\") cannot be searched from"] + # strptime tests are in optional.test strftime("%Y-%m-%dT%H:%M:%SZ")