various bugfixes and improvements

1. only generate tokens for valid utf8
1. fix the count in shard.ReadState.Total
1. properly order dupi.Query.Next instead of only round-robin.
go-air · Sep 21, 2021 · 53be587 · 53be587
1 parent b36e8eb
commit 53be587
Show file tree

Hide file tree

Showing 10 changed files with 54 additions and 17 deletions.
diff --git a/blotter/circ.go b/blotter/circ.go
@@ -15,6 +15,7 @@
 package blotter
 
 import (
+	"bytes"
 	"hash"
 	"hash/fnv"
 )
@@ -41,7 +42,7 @@ func (c *Circ) Interleaving() int {
 func (c *Circ) Blot(word []byte) uint32 {
 	fn := c.fn
 	fn.Reset()
-	fn.Write(word)
+	fn.Write(bytes.ToLower(word))
 	h := fn.Sum32()
 	c.hash ^= c.hashes[c.i]
 	c.hashes[c.i] = h

diff --git a/cmd/dupi/extract.go b/cmd/dupi/extract.go
@@ -69,6 +69,9 @@ func (x *extractCmd) Run(args []string) error {
 	for {
 		n, err := query.Next(shape)
 		if err == io.EOF {
+			if n != 0 {
+				panic(fmt.Sprintf("next gave EOF but n=%d\n", n))
+			}
 			return nil
 		}
 		if err != nil {

diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -83,10 +83,28 @@ dupi index .
 
 ## Extracting Duplicates
 
+Dupi extracts sets of documents which share a blot with the 'extract' verb.
+
 ```
 dupi extract
 ```
 
+These results are fast but noisy due to blot collisions.  Extraction
+skips any blots which are associated with one or fewer documents.
+
+Some options may be of interest:
+
+```
+-b output only blots, one per line
+-json output json
+-sigma float output only those blots with mean + sigma documents
+By default, sigma is 3.0; it represents the standard deviation of
+the number of documents associated with a blot.  A higher value
+outputs less information which is more likely to be associated with
+actual duplicate text.  A lower value is more thorough (has higher
+recall) but less precise.
+```
+
 ## Appending to the index
 
 ```
@@ -114,13 +132,10 @@ rudimentary, but here are some examples.
 dupi extract -b | xargs dupi unblot
 ```
 
-Or 
+### Like
 
-```
-dupi blot file | xargs dupi unblot
-```
-
-Much nicer, however is the 'like' verb
+Dupi provides a 'like' verb which permits finding documents that
+are similar to a given one which is not in the index.
 
 ```
 dupi like file
@@ -137,3 +152,5 @@ documentation using [issues](https://github.com/go-air/dupi/issues) or
 
 
 
+
+
diff --git a/fnames.go b/fnames.go
@@ -70,7 +70,6 @@ func (s *fnames) abs(v uint32) string {
 func (s *fnames) addPath(path string) (uint32, error) {
 	abs, err := filepath.Abs(path)
 	if err != nil {
-		fmt.Printf("abs\n")
 		return 0, err
 	}
 	var (

diff --git a/index.go b/index.go
@@ -283,9 +283,7 @@ func (x *Index) qstate(s QueryStrategy) *qstate {
 		shard := &x.shards[i]
 		qstate.shardStates[i] = shard.ReadStateAt(0)
 	}
-	qstate.blot = uint32(qstate.shardStates[qstate.i].Blot)
-	qstate.blot *= uint32(qstate.n)
-	qstate.blot += qstate.i
+	qstate.setMax()
 	return qstate
 }
 

diff --git a/internal/shard/index.go b/internal/shard/index.go
@@ -73,7 +73,7 @@ func (x *Index) ReadStateForBlotAt(blot, at uint16) *ReadState {
 	res.Shard = x.id
 	res.Blot = blot
 	res.At = at
-	res.Total = x.counts[at]
+	res.Total = x.counts[blot]
 	res.rdr = x.postFile
 	return res
 }

diff --git a/internal/shard/read_state.go b/internal/shard/read_state.go
@@ -29,8 +29,5 @@ type ReadState struct {
 func (s *ReadState) Next() (uint32, error) {
 	var docid uint32
 	docid, s.Error = s.Posts.next(s.rdr)
-	if s.Error != nil {
-		s.Total++
-	}
 	return docid, s.Error
 }
diff --git a/lock/file.go b/lock/file.go
@@ -29,6 +29,9 @@ func New(path string) (*File, error) {
 	return &File{path, f}, nil
 }
 
+// Close unlocks and then closes the file, returning any
+// error.  The file handle is closed whether or not
+// unlocking fails with an error.
 func (f *File) Close() error {
 	erru := f.Unlock()
 	errc := f.handle.Close()

diff --git a/query.go b/query.go
@@ -27,10 +27,23 @@ type qstate struct {
 	shardStates []*shard.ReadState
 	i           uint32
 	n           uint32
-	blot        uint32
 	nilCount    uint32
 }
 
+func (s *qstate) setMax() {
+	var maxTotal, p uint32
+	for i, ss := range s.shardStates {
+		if ss == nil {
+			continue
+		}
+		if ss.Total > maxTotal {
+			maxTotal = ss.Total
+			p = uint32(i)
+		}
+	}
+	s.i = p
+}
+
 var ErrInvalidQueryState = errors.New("query state invalid")
 
 type QueryStrategy int
@@ -144,10 +157,12 @@ func (q *Query) advance(src *shard.ReadState, pos uint32) *shard.ReadState {
 	if src.At == math.MaxUint16 {
 
 	} else if src.Total <= 1 {
+		//fmt.Printf("read state at %d has %d, exhausted\n", pos, src.Total)
 
 	} else {
 		rs = q.index.shards[pos].ReadStateAt(src.At + 1)
 	}
 	q.state.shardStates[pos] = rs
-	return rs
+	q.state.setMax()
+	return q.state.shardStates[q.state.i]
 }
diff --git a/token/t.go b/token/t.go
@@ -18,6 +18,7 @@ package token
 import (
 	"fmt"
 	"unicode"
+	"unicode/utf8"
 )
 
 // Tag represents a value in an enumeration of
@@ -56,6 +57,9 @@ func (t *T) String() string {
 
 // Tokenize is a tokenizer function.
 func Tokenize(dst []T, d []byte, offset uint32) []T {
+	if !utf8.Valid(d) {
+		return dst
+	}
 	inWord := false
 	var i, j int
 	var r rune