Skip to content

Commit

Permalink
various bugfixes and improvements
Browse files Browse the repository at this point in the history
1. only generate tokens for valid utf8
1. fix the count in shard.ReadState.Total
1. properly order dupi.Query.Next instead
of only round-robin.
  • Loading branch information
scott-cotton committed Sep 21, 2021
1 parent b36e8eb commit 53be587
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 17 deletions.
3 changes: 2 additions & 1 deletion blotter/circ.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package blotter

import (
"bytes"
"hash"
"hash/fnv"
)
Expand All @@ -41,7 +42,7 @@ func (c *Circ) Interleaving() int {
func (c *Circ) Blot(word []byte) uint32 {
fn := c.fn
fn.Reset()
fn.Write(word)
fn.Write(bytes.ToLower(word))
h := fn.Sum32()
c.hash ^= c.hashes[c.i]
c.hashes[c.i] = h
Expand Down
3 changes: 3 additions & 0 deletions cmd/dupi/extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ func (x *extractCmd) Run(args []string) error {
for {
n, err := query.Next(shape)
if err == io.EOF {
if n != 0 {
panic(fmt.Sprintf("next gave EOF but n=%d\n", n))
}
return nil
}
if err != nil {
Expand Down
29 changes: 23 additions & 6 deletions docs/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,28 @@ dupi index .

## Extracting Duplicates

Dupi extracts sets of documents which share a blot with the 'extract' verb.

```
dupi extract
```

These results are fast but noisy due to blot collisions. Extraction
skips any blots which are associated with one or fewer documents.

Some options may be of interest:

```
-b output only blots, one per line
-json output json
-sigma float output only those blots with mean + sigma documents
By default, sigma is 3.0; it represents the standard deviation of
the number of documents associated with a blot. A higher value
outputs less information, which is more likely to be associated with
actual duplicate text. A lower value is more thorough (has higher
recall) but has lower precision.
```

## Appending to the index

```
Expand Down Expand Up @@ -114,13 +132,10 @@ rudimentary, but here are some examples.
dupi extract -b | xargs dupi unblot
```

Or
### Like

```
dupi blot file | xargs dupi unblot
```

Much nicer, however is the 'like' verb
Dupi provides a 'like' verb which permits finding documents that
are similar to a given one which is not in the index.

```
dupi like file
Expand All @@ -137,3 +152,5 @@ documentation using [issues](https://github.com/go-air/dupi/issues) or





1 change: 0 additions & 1 deletion fnames.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ func (s *fnames) abs(v uint32) string {
func (s *fnames) addPath(path string) (uint32, error) {
abs, err := filepath.Abs(path)
if err != nil {
fmt.Printf("abs\n")
return 0, err
}
var (
Expand Down
4 changes: 1 addition & 3 deletions index.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,7 @@ func (x *Index) qstate(s QueryStrategy) *qstate {
shard := &x.shards[i]
qstate.shardStates[i] = shard.ReadStateAt(0)
}
qstate.blot = uint32(qstate.shardStates[qstate.i].Blot)
qstate.blot *= uint32(qstate.n)
qstate.blot += qstate.i
qstate.setMax()
return qstate
}

Expand Down
2 changes: 1 addition & 1 deletion internal/shard/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (x *Index) ReadStateForBlotAt(blot, at uint16) *ReadState {
res.Shard = x.id
res.Blot = blot
res.At = at
res.Total = x.counts[at]
res.Total = x.counts[blot]
res.rdr = x.postFile
return res
}
Expand Down
3 changes: 0 additions & 3 deletions internal/shard/read_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,5 @@ type ReadState struct {
// Next reads the next document id from the postings in s.Posts using
// s.rdr. Any error from the read is recorded in s.Error and is also
// returned to the caller.
//
// Next never modifies s.Total: a failed read must not change the count.
func (s *ReadState) Next() (uint32, error) {
	var docid uint32
	docid, s.Error = s.Posts.next(s.rdr)
	return docid, s.Error
}
3 changes: 3 additions & 0 deletions lock/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ func New(path string) (*File, error) {
return &File{path, f}, nil
}

// Close unlocks and then closes the file, returning any
// error. The file handle is closed whether or not
// unlocking fails with an error.
func (f *File) Close() error {
erru := f.Unlock()
errc := f.handle.Close()
Expand Down
19 changes: 17 additions & 2 deletions query.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,23 @@ type qstate struct {
shardStates []*shard.ReadState
i uint32
n uint32
blot uint32
nilCount uint32
}

// setMax stores in s.i the index of the shard state with the largest
// Total. Nil shard states are ignored. On ties the lowest index wins,
// and when no non-nil state has a Total above zero, s.i is set to 0.
func (s *qstate) setMax() {
	var (
		best  uint32
		index uint32
	)
	for k := range s.shardStates {
		st := s.shardStates[k]
		// Strict comparison keeps the earliest index among equal totals.
		if st != nil && st.Total > best {
			best, index = st.Total, uint32(k)
		}
	}
	s.i = index
}

var ErrInvalidQueryState = errors.New("query state invalid")

type QueryStrategy int
Expand Down Expand Up @@ -144,10 +157,12 @@ func (q *Query) advance(src *shard.ReadState, pos uint32) *shard.ReadState {
if src.At == math.MaxUint16 {

} else if src.Total <= 1 {
//fmt.Printf("read state at %d has %d, exhausted\n", pos, src.Total)

} else {
rs = q.index.shards[pos].ReadStateAt(src.At + 1)
}
q.state.shardStates[pos] = rs
return rs
q.state.setMax()
return q.state.shardStates[q.state.i]
}
4 changes: 4 additions & 0 deletions token/t.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package token
import (
"fmt"
"unicode"
"unicode/utf8"
)

// Tag represents a value in an enumeration of
Expand Down Expand Up @@ -56,6 +57,9 @@ func (t *T) String() string {

// Tokenize is a tokenizer function.
func Tokenize(dst []T, d []byte, offset uint32) []T {
if !utf8.Valid(d) {
return dst
}
inWord := false
var i, j int
var r rune
Expand Down

0 comments on commit 53be587

Please sign in to comment.