From 0c262a25a98ee638f97af5341f758d9d859eb882 Mon Sep 17 00:00:00 2001 From: Andrey Kotlarski Date: Tue, 28 Aug 2018 03:43:30 +0300 Subject: [PATCH] Perform operations over raw bytes where possible. --- LICENSE | 28 +++++++++ README.org | 40 +++++++++++++ fdbq.asd | 5 +- fdbq.lisp | 156 +++++++++++++++++------------------------------ filter.lisp | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++ spec.lisp | 22 +++---- 6 files changed, 305 insertions(+), 116 deletions(-) create mode 100644 LICENSE create mode 100644 filter.lisp diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9412ae0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +Copyright 2018 Andrey Kotlarski + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.org b/README.org index 422e1c1..528688a 100644 --- a/README.org +++ b/README.org @@ -1,3 +1,43 @@ * SQL-like operations over fixed field DBs See [[https://m00natic.github.io/lisp/manual-jit.html][Uniform Structured Syntax, Metaprogramming and Run-time Compilation]] + +This is a Common Lisp library which is to enable subset of SQL-like +operations over DBs with fields/columns with fixed number of ASCII +bytes. Thus far it works on files in such format. + +** Declare DBs + +The defspec macro is to be used to register a DB with respective +schema. Here's how the article example would look like: + +#+BEGIN_SRC lisp + (defspec (recordS5 ;db name, letter case doesn't matter + :file ;db type, currently only :file is supported + "/path/to/records5.db") ;filename + ;; declare fields in order, offsets are automatically assigned + 2 ;2 unnamed filler bytes + (cxr 2) ;named field with 2 bytes + (type 1) + (subcode 3) + (date_eff 6) + (date_disc 6) + 20 + (commercial_name 16)) +#+END_SRC + +** Operations + +Here's the article example: + +#+BEGIN_SRC lisp + (select (cxr subcode commercial_name date_disc) ;list of selected fields + recordS5 ;db name + :where (and (like cxr "YY|XX") ;where clause + (or (and (like COMMERCIAL_NAME "PET") + (= type "C")) + (and (= type "F") + (= commercial_name "MEAL"))) + (< "180620" date_disc) + (< date_eff date_disc))) +#+END_SRC diff --git a/fdbq.asd b/fdbq.asd index e343b3c..931e544 100644 --- a/fdbq.asd +++ b/fdbq.asd @@ -4,9 +4,10 @@ :description "SQL-like querying over fixed-field DBs." :author "Andrey Kotlarski " :license "BSD-3" - :version "0.0.1" - :depends-on (:cl-ppcre #:cl-string-match) + :version "0.0.2" + :depends-on (:cl-ppcre #:ascii-strings) :serial t :components ((:file "package") (:file "spec") + (:file "filter") (:file "fdbq"))) diff --git a/fdbq.lisp b/fdbq.lisp index 6cba1da..b467d5f 100644 --- a/fdbq.lisp +++ b/fdbq.lisp @@ -6,125 +6,79 @@ "Max chunk size to preload in bytes.") (defmacro select (field-list db &key where) - "Generate selection procedure and run it." + "Select FIELD-LIST from DB with WHERE filter." (let ((spec (get-spec db))) ;pull out the specification for this db `(funcall (compile nil (lambda () ;guarantee execution of a compiled object (declare (optimize (speed 3) (debug 0) (safety 0) (compilation-speed 0))) - (do-lines (line ,spec) ;bind line to db entries + (do-lines (line ,spec buffer offset) ;bind buffer/line to db entries ;; if where is empty, condition is considered always satisfied - (when ,(or (gen-where where 'line spec) t) - ,(gen-print-selection field-list 'line spec)))))))) + (when ,(or (gen-where where 'line spec 'buffer 'offset) t) + ,(gen-print-selection field-list 'line spec + 'buffer 'offset)))))))) -(defmacro do-lines ((line-var spec) &body body) - "Bind LINE-VAR to each line in SPEC specified file and execute BODY." +(defmacro do-lines ((line-var spec &optional buffer-var offset-var) &body body) + "Bind LINE-VAR to each line in SPEC specified source and execute BODY. +If BUFFER-VAR for byte array and OFFSET-VAR for offset within it are supplied, +expose them to BODY as well." + (when (eq :file (spec-type spec)) + `(do-file-lines (,line-var ,spec ,buffer-var ,offset-var) ,@body))) + +(defmacro do-file-lines ((line-var spec &optional buffer-var offset-var) &body body) + "Bind LINE-VAR to each line in SPEC specified file and execute BODY. +If BUFFER-VAR for byte array and OFFSET-VAR for offset within it are supplied, +expose them to BODY as well." (let* ((entry-size (spec-size spec)) ;entry size is known (line-size (1+ entry-size)) ;add 1 for newline (buffer-size (* line-size (floor +max-buffer-size+ line-size))) (ins (gensym)) ;make sure the file stream variable is not visible to the body - (buffer (gensym)) ;-//- for read buffer - (bytes (gensym)) ;-//- for number of bytes read - (offset (gensym))) ;-//- current line within buffer offset) - `(let ((,buffer (make-array ,buffer-size :element-type 'ascii:ub-char)) ;allocate read + (bytes (gensym)) ;-//- for number of bytes read + (use-only-line? (null buffer-var))) + (unless buffer-var + (setf buffer-var (gensym) + offset-var (gensym))) + `(let ((,buffer-var (make-array ,buffer-size :element-type 'ascii:ub-char)) ;allocate read (,line-var (make-string ,entry-size :element-type 'base-char))) ;and line buffers - (declare (type (simple-array ascii:ub-char (,buffer-size)) ,buffer) + (declare (type (simple-array ascii:ub-char (,buffer-size)) ,buffer-var) (type (simple-base-string ,entry-size) ,line-var) ;no need for newline - (dynamic-extent ,buffer ,line-var)) ;use stack allocation if possible + (dynamic-extent ,buffer-var ,line-var)) ;use stack allocation if possible (with-open-file (,ins ,(spec-path spec) :direction :input :element-type 'ascii:ub-char) - (loop for ,bytes fixnum = (read-sequence ,buffer ,ins) ;read as many lines + (loop for ,bytes fixnum = (read-sequence ,buffer-var ,ins) ;read as many lines until (zerop ,bytes) ;; slide offset through the buffer - do (loop for ,offset fixnum from 0 below (* ,line-size - (floor ,bytes ,line-size)) + do (loop for ,offset-var fixnum from 0 below (* ,line-size + (floor ,bytes ,line-size)) by ,line-size - ;; fill line with the current window bytes while converting them - do (loop for i fixnum from 0 below ,entry-size - for j fixnum from ,offset - do (setf (aref ,line-var i) (code-char (aref ,buffer j)))) - ,@body)))))) ;the body only sees the line variable as before - -(defun gen-where (where line-var spec) - "Create actual boolean tree for WHERE. - LINE-VAR is symbol representing the current line variable. - SPEC contains fields' offset and size information." - (when (consp where) - (let ((op (first where))) - (cond ((consp op) ;several expressions in a row, recurse - (cons (gen-where op line-var spec) - (gen-where (rest where) line-var spec))) - ((member op '(and or not)) ;intermediate node, recurse - (cons op (gen-where (rest where) line-var spec))) - ((member op '(= /= < <= > >= like)) ;leaf - (gen-field-op where line-var spec)) - (t (error (format nil "Bad where clause: ~A" where))))))) - -(defun translate-op (op &optional char?) - "Return string/char operation corresponding to OP." - (intern (concatenate 'string (if char? "CHAR" "STRING") - (symbol-name op)))) - -(defun simple-regex? (str) - "Check if string is not really a regex. -This a bit pessimistic." - (not (find-if-not #'alphanumericp str))) - -(defmethod gen-field-clause ((op (eql 'like)) field1 field2 line-var) - "Generate code for regex clause." - (cond ((null (field-filter field2)) ;regex is taken from a field - `(cl-ppcre:scan (subseq ,line-var ,(db-field-offset field2) - :end ,(+ (db-field-offset field2) - (db-field-size field2))) - ,line-var :start ,(db-field-offset field1) - :end ,(+ (db-field-offset field1) - (db-field-size field1)))) - ((simple-regex? (field-filter field2)) ;use plain search instead of regex - `(search ,(field-filter field2) ,line-var - :start1 ,(db-field-offset field2) - :end1 ,(+ (db-field-offset field2) - (db-field-size field2)) - :start2 ,(db-field-offset field1) - :end2 ,(+ (db-field-offset field1) - (db-field-size field1)))) - (t `(cl-ppcre:scan ,(field-filter field2) - ,line-var :start ,(db-field-offset field1) - :end ,(+ (db-field-offset field1) - (db-field-size field1)))))) - -(defmethod gen-field-clause (op field1 field2 line-var) - "Generate code for a comparison clause." - (let ((size (min (db-field-size field1) - (db-field-size field2)))) - (if (= 1 size) ;optimize single character comparison - (list (translate-op op t) - (if (field-filter field1) ;string literal? - (aref (field-filter field1) 0) - `(aref ,line-var ,(db-field-offset field1))) - (if (field-filter field2) - (aref (field-filter field2) 0) - `(aref ,line-var ,(db-field-offset field2)))) - (list (translate-op op) - (or (field-filter field1) line-var) - (or (field-filter field2) line-var) - :start1 (db-field-offset field1) - :end1 (+ (db-field-offset field1) size) - :start2 (db-field-offset field2) - :end2 (+ (db-field-offset field2) size))))) - -(defun gen-field-op (clause line-var spec) - "Generate code for a leaf WHERE clause." - (destructuring-bind (op field1 field2) clause - (gen-field-clause op (get-operand-traits field1 spec) - (get-operand-traits field2 spec) line-var))) + do (progn + ,(when use-only-line? + `(loop for i fixnum from 0 below ,entry-size + for j fixnum from ,offset-var + do (setf (aref ,line-var i) + (code-char (aref ,buffer-var j))))) + ,@body))))))) -(defun gen-print-selection (fields line-var spec) +(defun gen-print-selection (fields line-var spec + &optional buffer-var offset-var) "Unroll selected FIELDS' print statements. - LINE-VAR is symbol representing the current line variable. - SPEC holds field offset details." +LINE-VAR is symbol representing the current line variable. +SPEC holds field offset details. +BUFFER-VAR is symbol representing the db buffer. +OFFSET-VAR is symbol representing the current offset in the db buffer." `(progn - ,@(loop for field in fields ;collect print statements in list and splice them - collect `(write-string ,line-var nil - :start ,(field-offset field spec) - :end ,(+ (field-offset field spec) ;constant fold - (field-size field spec)))) + (format t "|") + ,@(if buffer-var + (loop for field in fields ;collect print statements in list and splice them + collect `(loop for i fixnum from (+ ,offset-var + ,(field-offset field spec)) + below (+ ,offset-var ,(+ (field-offset field spec) + (field-size field spec))) + do (write-char (code-char (aref ,buffer-var i)))) + collect '(format t "|")) + (loop for field in fields ;collect print statements in list and splice them + collect `(write-string ,line-var nil + :start ,(field-offset field spec) + :end ,(+ (field-offset field spec) ;constant fold + (field-size field spec))) + collect '(format t "|"))) (format t "~%"))) diff --git a/filter.lisp b/filter.lisp new file mode 100644 index 0000000..5d43f8c --- /dev/null +++ b/filter.lisp @@ -0,0 +1,170 @@ +;;;; filter.lisp + +(in-package #:fdbq) + +(defun gen-where (where line-var spec &optional buffer-var offset-var) + "Create actual boolean tree for WHERE. +LINE-VAR is symbol representing the current line variable. +SPEC contains fields' offset and size information. +BUFFER-VAR is symbol representing the db buffer. +OFFSET-VAR is symbol representing the current offset in the db buffer. +If buffer and offset are nil, use operations only over the string line. +LINE-VAR, BUFFER-VAR and OFFSET-VAR are dynamic." + (declare (special line-var buffer-var offset-var)) + (when (consp where) + (let ((op (first where))) + (cond ((consp op) ;several expressions in a row, recurse + (cons (gen-where op line-var spec buffer-var offset-var) + (gen-where (rest where) line-var spec buffer-var offset-var))) + ((member op '(and or not)) ;intermediate node, recurse + (cons op (gen-where (rest where) line-var spec buffer-var offset-var))) + ((member op '(= /= < <= > >= like)) ;leaf + (gen-field-op where spec)) + (t (error (format nil "Bad where clause: ~A" where))))))) + +(defun gen-field-op (clause spec) + "Generate code for a leaf WHERE clause." + (declare (special buffer-var)) + (destructuring-bind (op field1 field2) clause + (if buffer-var + (gen-field-clause-raw op (get-operand-traits field1 spec) + (get-operand-traits field2 spec)) + (gen-field-clause op (get-operand-traits field1 spec) + (get-operand-traits field2 spec))))) + +(defun translate-op (op &optional char?) + (declare (special buffer-var)) + "Return string/char operation corresponding to OP." + (if buffer-var + (intern (concatenate 'string + (if char? "UB-CHAR" "UB-STRING") + (symbol-name op)) :ascii) + (intern (concatenate 'string + (if char? "CHAR" "STRING") + (symbol-name op))))) + +(defun simple-regex? (str) + "Check if string is not really a regex. +This a bit pessimistic." + (not (find-if-not #'alphanumericp str))) + +(defgeneric gen-field-clause (op field1 field2) + (:documentation "Generate code for operation with 2 operands over string line.")) + +(defgeneric gen-field-clause-raw (op field1 field2) + (:documentation "Generate code for operation with 2 operands over byte buffer with offset.")) + +(defmethod gen-field-clause ((op (eql 'like)) field1 field2) + "Generate code for regex clause over string line variable." + (declare (special line-var)) + (cond ((null (literal-filter field2)) ;regex is taken from a field + `(cl-ppcre:scan (subseq ,line-var ,(db-field-offset field2) + :end ,(+ (db-field-offset field2) + (db-field-size field2))) + ,line-var :start ,(db-field-offset field1) + :end ,(+ (db-field-offset field1) + (db-field-size field1)))) + ((simple-regex? (literal-filter field2)) ;use plain search instead of regex + `(search ,(literal-filter field2) ,line-var + :start1 ,(db-field-offset field2) + :end1 ,(+ (db-field-offset field2) + (db-field-size field2)) + :start2 ,(db-field-offset field1) + :end2 ,(+ (db-field-offset field1) + (db-field-size field1)))) + (t `(cl-ppcre:scan ,(literal-filter field2) + ,line-var :start ,(db-field-offset field1) + :end ,(+ (db-field-offset field1) + (db-field-size field1)))))) + +(defmethod gen-field-clause (op field1 field2) + "Generate code for a comparison clause over string line variable." + (declare (special line-var)) + (let ((size (min (db-field-size field1) + (db-field-size field2)))) + (if (= 1 size) ;optimize single character comparison + (list (translate-op op t) + (if (literal-filter field1) ;string literal? + (aref (literal-filter field1) 0) + `(aref ,line-var ,(db-field-offset field1))) + (if (literal-filter field2) + (aref (literal-filter field2) 0) + `(aref ,line-var ,(db-field-offset field2)))) + (list (translate-op op) + (or (literal-filter field1) line-var) + (or (literal-filter field2) line-var) + :start1 (db-field-offset field1) + :end1 (+ (db-field-offset field1) size) + :start2 (db-field-offset field2) + :end2 (+ (db-field-offset field2) size))))) + +(defmethod gen-field-clause-raw ((op (eql 'like)) field1 field2) + "Generate code for regex clause over raw byte buffer." + (declare (special line-var buffer-var offset-var)) + (cond ((null (literal-filter field2)) ;regex is taken from a field + `(progn + (loop for i fixnum from ,(db-field-offset field1) + below ,(+ (db-field-offset field1) + (db-field-size field1)) + for j fixnum from (+ ,offset-var ,(db-field-offset field1)) + do (setf (aref ,line-var i) (code-char (aref ,buffer-var j)))) + (loop for i fixnum from ,(db-field-offset field2) + below ,(+ (db-field-offset field2) + (db-field-size field2)) + for j fixnum from (+ ,offset-var ,(db-field-offset field2)) + do (setf (aref ,line-var i) (code-char (aref ,buffer-var j)))) + (cl-ppcre:scan (subseq ,line-var ,(db-field-offset field2) + :end ,(+ (db-field-offset field2) + (db-field-size field2))) + ,line-var :start ,(db-field-offset field1) + :end ,(+ (db-field-offset field1) + (db-field-size field1))))) + ((simple-regex? (literal-filter field2)) ;use plain search instead of regex + `(search ,(ascii:string-to-ub (literal-filter field2)) ,buffer-var + :start1 ,(db-field-offset field2) + :end1 ,(+ (db-field-offset field2) + (db-field-size field2)) + :start2 (+ ,offset-var ,(db-field-offset field1)) + :end2 (+ ,offset-var ,(+ (db-field-offset field1) + (db-field-size field1))))) + (t `(progn + (loop for i fixnum from ,(db-field-offset field1) + below ,(+ (db-field-offset field1) + (db-field-size field1)) + for j fixnum from (+ ,offset-var ,(db-field-offset field1)) + do (setf (aref ,line-var i) (code-char (aref ,buffer-var j)))) + (cl-ppcre:scan ,(literal-filter field2) + ,line-var :start ,(db-field-offset field1) + :end ,(+ (db-field-offset field1) + (db-field-size field1))))))) + +(defmethod gen-field-clause-raw (op field1 field2) + "Generate code for a comparison clause over raw byte buffer." + (declare (special buffer-var offset-var)) + (let ((size (min (db-field-size field1) + (db-field-size field2)))) + (if (= 1 size) ;optimize single character comparison + (list (translate-op op t) + (if (literal-filter field1) ;string literal? + (char-code (aref (literal-filter field1) 0)) + `(aref ,buffer-var (+ ,offset-var ,(db-field-offset field1)))) + (if (literal-filter field2) + (char-code (aref (literal-filter field2) 0)) + `(aref ,buffer-var (+ ,offset-var ,(db-field-offset field2))))) + `(,(translate-op op) + ,(if (literal-filter field1) + (ascii:string-to-ub (literal-filter field1)) + buffer-var) + ,(if (literal-filter field2) + (ascii:string-to-ub (literal-filter field2)) + buffer-var) + ,@(if (literal-filter field1) + (list :start1 (db-field-offset field1) + :end1 (+ (db-field-offset field1) size)) + `(:start1 (+ ,offset-var ,(db-field-offset field1)) + :end1 (+ ,offset-var ,(+ (db-field-offset field1) size)))) + ,@(if (literal-filter field2) + (list :start2 (db-field-offset field2) + :end2 (+ (db-field-offset field2) size)) + `(:start2 (+ ,offset-var ,(db-field-offset field2)) + :end2 (+ ,offset-var ,(+ (db-field-offset field2) size)))))))) diff --git a/spec.lisp b/spec.lisp index fae66e2..4d015dc 100644 --- a/spec.lisp +++ b/spec.lisp @@ -9,7 +9,7 @@ (size :accessor db-field-size :initarg :size))) (defclass operand-traits (db-field) - ((filter :accessor field-filter :initarg :filter :initform nil))) + ((filter :accessor literal-filter :initarg :filter :initform nil))) (defstruct spec (size 0 :type fixnum) @@ -27,7 +27,11 @@ (db-field-size (gethash field (spec-fields spec)))) (defmacro defspec ((name type path) &rest fields) - `(let* ((spec (make-spec :type ,type :path ,path)) + "Register db with NAME, TYPE (only :file supported for now) and file PATH. +FIELDS is a list of ordered column declarations. Field declaration may be +a list with 2 elements - field name and byte count. Or may be just a number +signifying unnamed number of filler bytes." + `(let* ((spec (make-spec :type ,type :path (merge-pathnames ,path))) (spec-fields (spec-fields spec)) (size 0)) (dolist (field-spec ',fields) @@ -40,6 +44,9 @@ (setf (spec-size spec) size (gethash ',name *dbs*) spec))) +(defgeneric get-operand-traits (operand spec) + (:documentation "Determine OPERAND dimensions according to SPEC.")) + (defmethod get-operand-traits ((operand string) spec) "Get dimensions of literal string OPERAND." (declare (ignore spec)) @@ -52,14 +59,3 @@ (let ((field (gethash operand (spec-fields spec)))) (make-instance 'operand-traits :offset (db-field-offset field) :size (db-field-size field)))) - -#+or -(defspec (recordS5x :file #P"/path/to/records5.db") - 2 - (cxr 2) - (type 1) - (subcode 3) - (date_eff 6) - (date_disc 6) - 20 - (commercial_name 16))