Optimize memory usage

Add a basic map storage that does manual searches to conserve memory
Change saved hashes format to allow multiple hashes for a given ID
Add a vptree storage

Maps in Go take up a huge amount of space. Changing IDList to []ID took
  memory from over 1GB down to 200MB (note this was on aarch64 MacOS,
  which for some reason uses less memory than aarch64 Linux).
  Exhaustive searches using slices took about 30 ms; search now takes
  50-60 ms, as it takes longer to iterate a map. Partial hashes will
  speed up searches to 8 ms at the cost of 700MB initial memory usage
  and 400MB idle (though this is on MacOS, which for some reason uses
  less memory than aarch64 Linux, so probably more like
  900MB initial -> 600 MB idle on an RPI running Linux).
This commit is contained in:
Timmy Welch 2024-09-07 14:51:18 -07:00
parent b1de95021a
commit 0928ed6ccf
11 changed files with 581 additions and 301 deletions

View File

@ -13,7 +13,7 @@ repos:
- id: go-imports - id: go-imports
args: [-w] args: [-w]
- repo: https://github.com/golangci/golangci-lint - repo: https://github.com/golangci/golangci-lint
rev: v1.59.1 rev: v1.60.3
hooks: hooks:
- id: golangci-lint - id: golangci-lint
- repo: https://github.com/asottile/setup-cfg-fmt - repo: https://github.com/asottile/setup-cfg-fmt

151
BasicMap.go Normal file
View File

@ -0,0 +1,151 @@
package ch
import (
"fmt"
"math/bits"
"sync"
"gitea.narnian.us/lordwelch/goimagehash"
)
// basicMapStorage is a hash store built on plain Go maps. Searches compare the
// query against every stored hash of the same kind (see Atleast).
type basicMapStorage struct {
// hashMutex is read-locked by GetMatches for the duration of a search.
hashMutex sync.RWMutex
// ids is consulted by AssociateIDs and GetIDs to find the ID list a known ID belongs to.
ids map[ID]*[]ID
// hashes maps a stored hash to its ID list; there is one map per hash kind,
// indexed by int(kind)-1.
hashes [3]map[uint64]*[]ID
}
// Atleast compares searchHash against every stored hash of kind hashKind and
// returns one Result for each stored hash that differs from it in at most
// maxDistance bits. It does no locking of its own.
func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
	// The kind's integer value minus one selects the per-kind map.
	stored := b.hashes[int(hashKind)-1]
	results := make([]Result, 0, 100) // starting guess; append grows it if more hashes match
	for candidate, idList := range stored {
		differingBits := bits.OnesCount64(searchHash ^ candidate)
		if differingBits > maxDistance {
			continue
		}
		results = append(results, Result{ToIDList(*idList), differingBits, Hash{candidate, hashKind}})
	}
	return results
}
// GetMatches searches the storage for each entry of hashes and returns the
// stored hashes that match, holding the read lock for the whole search.
//
// When exactOnly is true an exact lookup is tried first; if it yields any
// result, only those exact results (Distance 0) are returned. Otherwise, or
// when nothing matched exactly, every stored hash of the same kind is compared
// and those within max differing bits are returned. For each entry,
// int(hash.Kind)-1 is used as the index into the per-kind maps.
//
// The returned error is always nil.
func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
	var foundMatches []Result
	b.hashMutex.RLock()
	defer b.hashMutex.RUnlock()
	resetTime()

	if exactOnly {
		for _, hash := range hashes {
			hashType := int(hash.Kind) - 1
			ids := b.hashes[hashType][hash.Hash]
			if ids != nil && len(*ids) > 0 {
				foundMatches = append(foundMatches, Result{
					Distance: 0,
					Hash:     hash,
					IDs:      ToIDList(*ids),
				})
			}
		}

		// If we have exact matches don't bother with other matches
		if len(foundMatches) > 0 {
			return foundMatches, nil
		}
		logTime("Search Exact")
	}

	// De-duplicate on the full Hash (value and kind). Keying on the value
	// alone would drop a legitimate match of one kind whenever a hash of
	// another kind happened to have the same 64-bit value.
	foundHashes := make(map[Hash]struct{})
	totalHashesTested := 0
	for _, hash := range hashes {
		// Atleast compares against every stored hash of this kind.
		totalHashesTested += len(b.hashes[int(hash.Kind)-1])
		for _, match := range b.Atleast(hash.Kind, max, hash.Hash) {
			if _, alreadyMatched := foundHashes[match.Hash]; alreadyMatched {
				continue
			}
			foundHashes[match.Hash] = struct{}{}
			foundMatches = append(foundMatches, match)
		}
	}
	fmt.Println("Total partial hashes tested:", totalHashesTested, len(foundHashes))
	logTime("Search Complete")
	go b.printSizes()
	return foundMatches, nil
}
// MapHashes records hash.ID under every hash contained in hash.Hashes.
// For each entry, int(ih.Kind)-1 selects the per-kind map. It does no locking
// of its own.
func (b *basicMapStorage) MapHashes(hash ImageHash) {
	for _, ih := range hash.Hashes {
		hashType := int(ih.Kind) - 1
		ids := b.hashes[hashType][ih.Hash]
		if ids == nil {
			// First time this hash is seen: give it its own ID list instead
			// of dereferencing the nil pointer a missing map entry yields.
			ids = new([]ID)
			b.hashes[hashType][ih.Hash] = ids
		}
		*ids = InsertID(*ids, hash.ID)
	}
}
// DecodeHashes replaces the stored hashes with the contents of hashes.
// Each stored hash points at the corresponding element of hashes.IDs, so the
// ID lists are shared with the passed-in slice rather than copied.
//
// An error is returned, and the storage is left untouched, when a hash refers
// to an ID-list index that does not exist in hashes.IDs.
func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error {
	// Build into a local value first so a malformed input cannot leave the
	// storage half replaced.
	var decoded [3]map[uint64]*[]ID
	for hashType, sourceHashes := range hashes.Hashes {
		decoded[hashType] = make(map[uint64]*[]ID, len(sourceHashes))
		for savedHash, idlistLocation := range sourceHashes {
			if idlistLocation < 0 || idlistLocation >= len(hashes.IDs) {
				return fmt.Errorf("saved hash %016x of type %d references ID list %d, but only %d ID lists exist",
					savedHash, hashType, idlistLocation, len(hashes.IDs))
			}
			decoded[hashType][savedHash] = &hashes.IDs[idlistLocation]
		}
	}
	b.hashes = decoded
	b.printSizes()
	return nil
}
// printSizes is a debugging hook for reporting how much memory the storage uses.
// Every statement in it is currently commented out, so calling it does nothing.
func (b *basicMapStorage) printSizes() {
// fmt.Println("Size of", "hashes:", size.Of(b.hashes)/1024/1024, "MB")
// fmt.Println("Size of", "ids:", size.Of(b.ids)/1024/1024, "MB")
// fmt.Println("Size of", "basicMapStorage:", size.Of(b)/1024/1024, "MB")
}
// EncodeHashes converts the stored hashes into a SavedHashes value.
// Every distinct ID list referenced by a stored hash is written once to IDs,
// and each hash is mapped to the index of its list. Hashes that share the same
// list in memory share the same index in the output.
//
// The returned error is always nil.
func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) {
	hashes := SavedHashes{}
	// idmap remembers where each in-memory ID list was placed in hashes.IDs.
	idmap := map[*[]ID]int{}
	for hashType, hashToID := range b.hashes {
		// The per-kind maps of a zero SavedHashes are nil; writing to them
		// would panic, so allocate them first.
		hashes.Hashes[hashType] = make(map[uint64]int, len(hashToID))
		for hash, ids := range hashToID {
			if ids == nil {
				continue
			}
			idx, seen := idmap[ids]
			if !seen {
				// Take the index before appending so it is the position of
				// the list about to be added.
				idx = len(hashes.IDs)
				hashes.IDs = append(hashes.IDs, *ids)
				idmap[ids] = idx
			}
			hashes.Hashes[hashType][hash] = idx
		}
	}
	return hashes, nil
}
// AssociateIDs adds each NewID to the ID list that its OldID already belongs to.
// It panics when an OldID is not known to the storage. It does no locking of
// its own.
func (b *basicMapStorage) AssociateIDs(newids []NewIDs) {
	for _, newid := range newids {
		ids, found := b.ids[newid.OldID]
		if !found {
			msg := "No IDs belonging to " + newid.OldID.Domain + " exist on this server"
			panic(msg)
		}
		*ids = InsertID(*ids, newid.NewID)
		// Register the new ID as well so later lookups by it (GetIDs, or a
		// further AssociateIDs) find the same list.
		b.ids[newid.NewID] = ids
	}
}
// GetIDs returns, grouped by domain, every ID in the list that id belongs to.
// It panics when id is not known to the storage.
func (b *basicMapStorage) GetIDs(id ID) IDList {
	ids, found := b.ids[id]
	if !found {
		msg := "No IDs belonging to " + id.Domain + " exist on this server"
		panic(msg)
	}
	return ToIDList(*ids)
}
// NewBasicMapStorage returns an empty basicMapStorage with one allocated hash
// map per hash kind. The returned error is always nil.
func NewBasicMapStorage() (HashStorage, error) {
	var perKind [3]map[uint64]*[]ID
	for i := range perKind {
		perKind[i] = make(map[uint64]*[]ID)
	}
	// The zero value of the mutex is ready to use, so it needs no initializer.
	return &basicMapStorage{hashes: perKind}, nil
}

View File

@ -91,20 +91,26 @@ type Storage int
const ( const (
Map = iota + 1 Map = iota + 1
BasicMap
Sqlite Sqlite
Sqlite3 Sqlite3
VPTree
) )
var storageNames = map[Storage]string{ var storageNames = map[Storage]string{
Map: "map", Map: "map",
Sqlite: "sqlite", BasicMap: "basicmap",
Sqlite3: "sqlite3", Sqlite: "sqlite",
Sqlite3: "sqlite3",
VPTree: "vptree",
} }
var storageValues = map[string]Storage{ var storageValues = map[string]Storage{
"map": Map, "map": Map,
"sqlite": Sqlite, "basicmap": BasicMap,
"sqlite3": Sqlite3, "sqlite": Sqlite,
"sqlite3": Sqlite3,
"vptree": VPTree,
} }
func (f Storage) String() string { func (f Storage) String() string {
@ -138,7 +144,7 @@ type Opts struct {
} }
func main() { func main() {
opts := Opts{format: Msgpack, storageType: Map} // flag is weird opts := Opts{format: Msgpack, storageType: BasicMap} // flag is weird
go func() { go func() {
log.Println(http.ListenAndServe("localhost:6060", nil)) log.Println(http.ListenAndServe("localhost:6060", nil))
}() }()
@ -150,7 +156,7 @@ func main() {
flag.BoolVar(&opts.saveEmbeddedHashes, "save-embedded-hashes", false, "Save hashes even if we loaded the embedded hashes") flag.BoolVar(&opts.saveEmbeddedHashes, "save-embedded-hashes", false, "Save hashes even if we loaded the embedded hashes")
flag.StringVar(&opts.hashesPath, "hashes", "hashes.gz", "Path to optionally gziped hashes in msgpack or json format. You must disable embedded hashes to use this option") flag.StringVar(&opts.hashesPath, "hashes", "hashes.gz", "Path to optionally gziped hashes in msgpack or json format. You must disable embedded hashes to use this option")
flag.Var(&opts.format, "save-format", "Specify the format to export hashes to (json, msgpack)") flag.Var(&opts.format, "save-format", "Specify the format to export hashes to (json, msgpack)")
flag.Var(&opts.storageType, "storage-type", "Specify the storage type used internally to search hashes (sqlite,sqlite3,map)") flag.Var(&opts.storageType, "storage-type", "Specify the storage type used internally to search hashes (sqlite,sqlite3,map,basicmap,vptree)")
flag.Parse() flag.Parse()
if opts.coverPath != "" { if opts.coverPath != "" {
@ -350,6 +356,7 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) {
max int = 8 max int = 8
max_tmp int max_tmp int
err error err error
hashes []ch.Hash
) )
if ahash, err = strconv.ParseUint(ahashStr, 16, 64); err != nil && ahashStr != "" { if ahash, err = strconv.ParseUint(ahashStr, 16, 64); err != nil && ahashStr != "" {
@ -357,16 +364,25 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) {
writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"}) writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"})
return return
} }
if ahash > 0 {
hashes = append(hashes, ch.Hash{ahash, goimagehash.AHash})
}
if dhash, err = strconv.ParseUint(dhashStr, 16, 64); err != nil && dhashStr != "" { if dhash, err = strconv.ParseUint(dhashStr, 16, 64); err != nil && dhashStr != "" {
log.Printf("could not parse dhash: %s", dhashStr) log.Printf("could not parse dhash: %s", dhashStr)
writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"}) writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"})
return return
} }
if dhash > 0 {
hashes = append(hashes, ch.Hash{dhash, goimagehash.DHash})
}
if phash, err = strconv.ParseUint(phashStr, 16, 64); err != nil && phashStr != "" { if phash, err = strconv.ParseUint(phashStr, 16, 64); err != nil && phashStr != "" {
log.Printf("could not parse phash: %s", phashStr) log.Printf("could not parse phash: %s", phashStr)
writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"}) writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"})
return return
} }
if phash > 0 {
hashes = append(hashes, ch.Hash{phash, goimagehash.PHash})
}
if max_tmp, err = strconv.Atoi(maxStr); err != nil && maxStr != "" { if max_tmp, err = strconv.Atoi(maxStr); err != nil && maxStr != "" {
log.Printf("Invalid Max: %s", maxStr) log.Printf("Invalid Max: %s", maxStr)
writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Invalid Max: %s", maxStr)}) writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Invalid Max: %s", maxStr)})
@ -381,7 +397,10 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) {
writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Max must be less than 9: %d", max)}) writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Max must be less than 9: %d", max)})
return return
} }
matches, err := s.hashes.GetMatches([]ch.Hash{{ahash, goimagehash.AHash}, {dhash, goimagehash.DHash}, {phash, goimagehash.PHash}}, max, exactOnly) matches, err := s.hashes.GetMatches(hashes, max, exactOnly)
slices.SortFunc(matches, func(a ch.Result, b ch.Result) int {
return cmp.Compare(a.Distance, b.Distance)
})
log.Println(err) log.Println(err)
if len(matches) > 0 { if len(matches) > 0 {
var msg string = "" var msg string = ""
@ -532,10 +551,15 @@ func (s *Server) DecodeHashes(format Format, hashes []byte) error {
default: default:
return fmt.Errorf("Unknown format: %v", format) return fmt.Errorf("Unknown format: %v", format)
} }
loadedHashes := make(ch.SavedHashes) loadedHashes := ch.SavedHashes{}
err := decoder(hashes, &loadedHashes) err := decoder(hashes, &loadedHashes)
if err != nil { if err != nil || len(loadedHashes.IDs) == 0 {
return err fmt.Println("Failed to load hashes, checking if they are old hashes", err)
oldHashes := make(ch.OldSavedHashes)
if err = decoder(hashes, &oldHashes); err != nil {
return err
}
loadedHashes = ch.ConvertSavedHashes(oldHashes)
} }
return s.hashes.DecodeHashes(loadedHashes) return s.hashes.DecodeHashes(loadedHashes)
@ -597,10 +621,14 @@ func initializeStorage(opts Opts) (ch.HashStorage, error) {
switch opts.storageType { switch opts.storageType {
case Map: case Map:
return ch.NewMapStorage() return ch.NewMapStorage()
case BasicMap:
return ch.NewBasicMapStorage()
case Sqlite: case Sqlite:
return ch.NewSqliteStorage("sqlite", opts.sqlitePath) return ch.NewSqliteStorage("sqlite", opts.sqlitePath)
case Sqlite3: case Sqlite3:
return ch.NewSqliteStorage("sqlite3", opts.sqlitePath) return ch.NewSqliteStorage("sqlite3", opts.sqlitePath)
case VPTree:
return ch.NewVPStorage()
} }
return nil, errors.New("Unknown storage type provided") return nil, errors.New("Unknown storage type provided")
} }

17
cmd/comic-hasher/tmp.go Normal file
View File

@ -0,0 +1,17 @@
//go:build main
package main
import (
"fmt"
"time"
)
// main fills a string slice with 932460 copies of the same ID string, prints
// the resulting length, and then sleeps for a minute before exiting.
func main() {
	const entry = "comicvine.gamespot.com:123456"
	ids := make([]string, 0, 932456)
	for i := 0; i < 932460; i++ {
		ids = append(ids, entry)
	}
	fmt.Println(len(ids))
	time.Sleep(time.Minute)
}

10
go.mod
View File

@ -1,8 +1,6 @@
module gitea.narnian.us/lordwelch/comic-hasher module gitea.narnian.us/lordwelch/comic-hasher
go 1.22.1 go 1.23.0
toolchain go1.22.2
require ( require (
gitea.narnian.us/lordwelch/goimagehash v0.0.0-20240812025715-33ff96e45f00 gitea.narnian.us/lordwelch/goimagehash v0.0.0-20240812025715-33ff96e45f00
@ -10,8 +8,10 @@ require (
github.com/kr/pretty v0.1.0 github.com/kr/pretty v0.1.0
github.com/mattn/go-sqlite3 v1.14.22 github.com/mattn/go-sqlite3 v1.14.22
github.com/mholt/archiver/v4 v4.0.0-alpha.8 github.com/mholt/archiver/v4 v4.0.0-alpha.8
github.com/ncruces/go-sqlite3 v0.18.1
golang.org/x/image v0.19.0 golang.org/x/image v0.19.0
golang.org/x/text v0.17.0 golang.org/x/text v0.17.0
gonum.org/v1/gonum v0.15.1
modernc.org/sqlite v1.32.0 modernc.org/sqlite v1.32.0
) )
@ -40,14 +40,16 @@ require (
github.com/kr/text v0.1.0 // indirect github.com/kr/text v0.1.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v0.1.9 // indirect github.com/ncruces/go-strftime v0.1.9 // indirect
github.com/ncruces/julianday v1.0.0 // indirect
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect
github.com/pierrec/lz4/v4 v4.1.15 // indirect github.com/pierrec/lz4/v4 v4.1.15 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/tetratelabs/wazero v1.8.0 // indirect
github.com/therootcompany/xz v1.0.1 // indirect github.com/therootcompany/xz v1.0.1 // indirect
github.com/ulikunitz/xz v0.5.10 // indirect github.com/ulikunitz/xz v0.5.10 // indirect
go4.org v0.0.0-20200411211856-f5505b9728dd // indirect go4.org v0.0.0-20200411211856-f5505b9728dd // indirect
golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect
golang.org/x/sys v0.22.0 // indirect golang.org/x/sys v0.24.0 // indirect
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect
modernc.org/libc v1.55.3 // indirect modernc.org/libc v1.55.3 // indirect
modernc.org/mathutil v1.6.0 // indirect modernc.org/mathutil v1.6.0 // indirect

12
go.sum
View File

@ -115,8 +115,12 @@ github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/mholt/archiver/v4 v4.0.0-alpha.8 h1:tRGQuDVPh66WCOelqe6LIGh0gwmfwxUrSSDunscGsRM= github.com/mholt/archiver/v4 v4.0.0-alpha.8 h1:tRGQuDVPh66WCOelqe6LIGh0gwmfwxUrSSDunscGsRM=
github.com/mholt/archiver/v4 v4.0.0-alpha.8/go.mod h1:5f7FUYGXdJWUjESffJaYR4R60VhnHxb2X3T1teMyv5A= github.com/mholt/archiver/v4 v4.0.0-alpha.8/go.mod h1:5f7FUYGXdJWUjESffJaYR4R60VhnHxb2X3T1teMyv5A=
github.com/ncruces/go-sqlite3 v0.18.1 h1:iN8IMZV5EMxpH88NUac9vId23eTKNFUhP7jgY0EBbNc=
github.com/ncruces/go-sqlite3 v0.18.1/go.mod h1:eEOyZnW1dGTJ+zDpMuzfYamEUBtdFz5zeYhqLBtHxvM=
github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M=
github.com/ncruces/julianday v1.0.0/go.mod h1:Dusn2KvZrrovOMJuOt0TNXL6tB7U2E8kvza5fFc9G7g=
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 h1:e3mzJFJs4k83GXBEiTaQ5HgSc/kOK8q0rDaRO0MPaOk= github.com/nwaples/rardecode/v2 v2.0.0-beta.2 h1:e3mzJFJs4k83GXBEiTaQ5HgSc/kOK8q0rDaRO0MPaOk=
github.com/nwaples/rardecode/v2 v2.0.0-beta.2/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY= github.com/nwaples/rardecode/v2 v2.0.0-beta.2/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY=
github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0= github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0=
@ -133,6 +137,8 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/tetratelabs/wazero v1.8.0 h1:iEKu0d4c2Pd+QSRieYbnQC9yiFlMS9D+Jr0LsRmcF4g=
github.com/tetratelabs/wazero v1.8.0/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs=
github.com/therootcompany/xz v1.0.1 h1:CmOtsn1CbtmyYiusbfmhmkpAAETj0wBIH6kCYaX+xzw= github.com/therootcompany/xz v1.0.1 h1:CmOtsn1CbtmyYiusbfmhmkpAAETj0wBIH6kCYaX+xzw=
github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0BWbMn8qNMY= github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0BWbMn8qNMY=
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
@ -232,8 +238,8 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@ -273,6 +279,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=
gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=

View File

@ -77,7 +77,82 @@ type Hash struct {
Kind goimagehash.Kind Kind goimagehash.Kind
} }
type SavedHashes map[Source]map[string][3]uint64 // IDList is a map of domain to ID eg IDs["comicvine.gamespot.com"] = []string{"1235"}
// Maps are extremely expensive in Go for small maps; this should only be used to return info to a user. No internal code should use this.
type IDList map[Source][]string
// OldSavedHashes is the earlier saved-hash layout: source domain -> ID -> three hashes.
// ConvertSavedHashes turns it into a SavedHashes, using the array position as the hash type.
type OldSavedHashes map[Source]map[string][3]uint64
// SavedHashes is the serialized form of the stored hashes.
type SavedHashes struct {
// IDs holds the ID lists; Hashes refers to them by index.
IDs [][]ID
// Hashes maps a hash to the index of its ID list in IDs. There is one map per
// hash kind, indexed by int(kind)-1.
Hashes [3]map[uint64]int
}
// ToIDList groups ids by domain, passing each ID string through Insert to
// build the per-domain slice. The result is always a non-nil map.
func ToIDList(ids []ID) IDList {
	grouped := make(IDList)
	for i := range ids {
		domain := ids[i].Domain
		grouped[domain] = Insert(grouped[domain], ids[i].ID)
	}
	return grouped
}
// InsertID returns ids with id added at the position a binary search reports
// for it, comparing by Domain first and then by ID. If the search finds id
// already present, ids is returned unchanged.
func InsertID(ids []ID, id ID) []ID {
	byDomainThenID := func(existing ID, target ID) int {
		if order := cmp.Compare(existing.Domain, target.Domain); order != 0 {
			return order
		}
		return cmp.Compare(existing.ID, target.ID)
	}
	pos, present := slices.BinarySearchFunc(ids, id, byDomainThenID)
	if !present {
		ids = slices.Insert(ids, pos, id)
	}
	return ids
}
// InsertHash associates id with hash. A hash seen for the first time gets a new
// ID list appended to s.IDs; a known hash has id inserted into its existing list.
// Any per-kind map that is still nil is allocated first.
func (s *SavedHashes) InsertHash(hash Hash, id ID) {
	for i := range s.Hashes {
		if s.Hashes[i] == nil {
			s.Hashes[i] = make(map[uint64]int)
		}
	}

	// The kind's integer value minus one selects the per-kind map.
	byHash := s.Hashes[int(hash.Kind)-1]
	listIndex, known := byHash[hash.Hash]
	if !known {
		listIndex = len(s.IDs)
		s.IDs = append(s.IDs, make([]ID, 0, 3))
	}
	s.IDs[listIndex] = InsertID(s.IDs[listIndex], id)
	byHash[hash.Hash] = listIndex
}
// ConvertSavedHashes builds a SavedHashes from the old layout. Every old ID
// gets its own single-element ID list, and each of its three hashes is mapped
// to that list using the array position as the hash type.
//
// A hash value that occurs for more than one ID keeps only the index written
// last. Some counts are printed to standard output along the way.
func ConvertSavedHashes(oldHashes OldSavedHashes) SavedHashes {
	expectedIDs := 0
	for _, perSource := range oldHashes {
		expectedIDs += len(perSource)
	}

	converted := SavedHashes{IDs: make([][]ID, 0, expectedIDs)}
	for kind := range converted.Hashes {
		converted.Hashes[kind] = make(map[uint64]int, expectedIDs)
	}

	for source, perSource := range oldHashes {
		for rawID, triple := range perSource {
			converted.IDs = append(converted.IDs, []ID{{source, rawID}})
			listIndex := len(converted.IDs) - 1
			for kind, value := range triple {
				converted.Hashes[kind][value] = listIndex
			}
		}
	}
	fmt.Println("Expected number of IDs", expectedIDs)

	totalIDs := 0
	for _, list := range converted.IDs {
		totalIDs += len(list)
	}
	hashCount := len(converted.Hashes[0]) + len(converted.Hashes[1]) + len(converted.Hashes[2])
	fmt.Println("length of hashes", hashCount)
	fmt.Println("Length of ID lists", len(converted.IDs))
	fmt.Println("Total number of IDs", totalIDs)
	return converted
}
type NewIDs struct { type NewIDs struct {
OldID ID OldID ID
@ -171,5 +246,3 @@ func SplitHash(hash uint64) [8]uint8 {
uint8((hash & H0) >> Shift0), uint8((hash & H0) >> Shift0),
} }
} }
type IDList map[Source][]string // IDs is a map of domain to ID eg IDs['comicvine.gamespot.com'] = []string{"1235"}

315
map.go
View File

@ -1,100 +1,32 @@
package ch package ch
import ( import (
"cmp" "fmt"
"math/bits"
"slices" "slices"
"sync" "sync"
"gitea.narnian.us/lordwelch/goimagehash"
) )
type mapStorage struct { type MapStorage struct {
hashMutex sync.RWMutex basicMapStorage
partialHash [3][8]map[uint8][]int partialHash [3][8]map[uint8][]uint64
// partialAhash [8]map[uint8][]int
// partialDhash [8]map[uint8][]int
// partialPhash [8]map[uint8][]int
ids []ID
idToHash map[int][3][]int
hashes [3][]uint64
// ahashes []uint64
// dhashes []uint64
// phashes []uint64
hashToID [3]map[int][]int
// ahashToID map[int][]int
// dhashToID map[int][]int
// phashToID map[int][]int
} }
func (m *mapStorage) addID(id ID) int { func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
index, itemFound := slices.BinarySearchFunc(m.ids, id, func(existing, new ID) int {
return cmp.Or(
cmp.Compare(existing.Domain, new.Domain),
cmp.Compare(existing.ID, new.ID),
)
})
if itemFound {
return index
}
m.ids = slices.Insert(m.ids, index, id)
return index
}
func (m *mapStorage) getID(id ID) (int, bool) {
return slices.BinarySearchFunc(m.ids, id, func(existing, new ID) int {
return cmp.Or(
cmp.Compare(existing.Domain, new.Domain),
cmp.Compare(existing.ID, new.ID),
)
})
}
func (m *mapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64, hashes []int) []Result {
hashType := int(hashKind) - 1
matchingHashes := make([]Result, 0, len(hashes)/2) // hope that we don't need all of them
for _, idx := range hashes {
storedHash := m.hashes[hashType][idx]
distance := bits.OnesCount64(searchHash ^ storedHash)
if distance <= maxDistance {
ids := make(IDList)
for _, idLocation := range m.hashToID[hashType][idx] {
ids[m.ids[idLocation].Domain] = Insert(ids[m.ids[idLocation].Domain], m.ids[idLocation].ID)
}
matchingHashes = append(matchingHashes, Result{ids, distance, Hash{storedHash, hashKind}})
}
}
return matchingHashes
}
func (m *mapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var foundMatches []Result var foundMatches []Result
m.hashMutex.RLock() m.hashMutex.RLock()
defer m.hashMutex.RUnlock() defer m.hashMutex.RUnlock()
resetTime()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes { for _, hash := range hashes {
hashType := int(hash.Kind) - 1 hashType := int(hash.Kind) - 1
if hashLocation, found := slices.BinarySearch(m.hashes[hashType], hash.Hash); found { idlist := m.hashes[hashType][hash.Hash]
idlist := make(IDList) if idlist != nil && len(*idlist) > 0 {
for _, idLocation := range m.hashToID[hashType][hashLocation] { foundMatches = append(foundMatches, Result{
Distance: 0,
for _, hashLocation := range m.idToHash[idLocation][0] { Hash: hash,
for _, foundIDLocation := range m.hashToID[hashType][hashLocation] { IDs: ToIDList(*idlist),
foundID := m.ids[foundIDLocation] })
idlist[foundID.Domain] = Insert(idlist[foundID.Domain], foundID.ID)
}
}
}
if len(idlist) > 0 {
foundMatches = append(foundMatches, Result{
Distance: 0,
Hash: hash,
})
}
} }
} }
@ -102,173 +34,114 @@ func (m *mapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
if len(foundMatches) > 0 && exactOnly { if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil return foundMatches, nil
} }
logTime("Search Exact")
} }
foundHashes := make(map[uint64]struct{}) totalPartialHashes := 0
for _, hash := range hashes { for _, searchHash := range hashes {
if hash.Hash == 0 { foundHashes := make(map[uint64]struct{})
continue hashType := int(searchHash.Kind) - 1
} for i, partialHash := range SplitHash(searchHash.Hash) {
hashType := int(hash.Kind) - 1 partialHashes := m.partialHash[hashType][i][partialHash]
for i, partialHash := range SplitHash(hash.Hash) { totalPartialHashes += len(partialHashes)
for _, match := range m.Atleast(hash.Kind, max, hash.Hash, m.partialHash[hashType][i][partialHash]) { for _, match := range Atleast(max, searchHash.Hash, partialHashes) {
_, alreadyMatched := foundHashes[match.Hash.Hash] _, alreadyMatched := foundHashes[match.Hash]
if alreadyMatched { if matchedResults, ok := m.hashes[hashType][match.Hash]; ok && !alreadyMatched {
continue foundHashes[match.Hash] = struct{}{}
foundMatches = append(foundMatches, Result{IDs: ToIDList(*matchedResults), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}})
} }
foundMatches = append(foundMatches, match)
} }
} }
} }
fmt.Println("Total partial hashes tested:", totalPartialHashes)
logTime("Search Complete")
go m.printSizes()
return foundMatches, nil return foundMatches, nil
} }
func (m *mapStorage) MapHashes(hash ImageHash) { func (m *MapStorage) MapHashes(hash ImageHash) {
m.basicMapStorage.MapHashes(hash)
idIndex := m.addID(hash.ID)
idHashes := m.idToHash[idIndex]
for _, hash := range hash.Hashes { for _, hash := range hash.Hashes {
var ( hashType := int(hash.Kind) - 1
hashIndex int
hashType = int(hash.Kind) - 1
)
m.hashes[hashType], hashIndex = InsertIdx(m.hashes[hashType], hash.Hash)
for i, partialHash := range SplitHash(hash.Hash) { for i, partialHash := range SplitHash(hash.Hash) {
m.partialHash[hashType][i][partialHash] = append(m.partialHash[hashType][i][partialHash], hashIndex) m.partialHash[hashType][i][partialHash] = Insert(m.partialHash[hashType][i][partialHash], hash.Hash)
} }
idHashes[hashType] = Insert(idHashes[hashType], hashIndex)
m.hashToID[hashType][hashIndex] = Insert(m.hashToID[hashType][hashIndex], idIndex)
} }
m.idToHash[idIndex] = idHashes
} }
func (m *mapStorage) DecodeHashes(hashes SavedHashes) error { func (m *MapStorage) DecodeHashes(hashes SavedHashes) error {
for hashType, sourceHashes := range hashes.Hashes {
for _, sourceHashes := range hashes { m.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes))
m.hashes[0] = make([]uint64, 0, len(sourceHashes)) for savedHash, idlistLocation := range sourceHashes {
m.hashes[1] = make([]uint64, 0, len(sourceHashes)) for i, partialHash := range SplitHash(savedHash) {
m.hashes[2] = make([]uint64, 0, len(sourceHashes)) m.partialHash[hashType][i][partialHash] = append(m.partialHash[hashType][i][partialHash], savedHash)
break }
m.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation]
}
} }
for domain, sourceHashes := range hashes { m.printSizes()
for id, h := range sourceHashes { for _, partialHashes := range m.partialHash {
m.ids = append(m.ids, ID{Domain: Source(domain), ID: id}) for _, partMap := range partialHashes {
for part, hashes := range partMap {
for _, hash := range []Hash{Hash{h[0], goimagehash.AHash}, Hash{h[1], goimagehash.DHash}, Hash{h[2], goimagehash.PHash}} { slices.Sort(hashes)
var ( partMap[part] = slices.Compact(hashes)
hashType = int(hash.Kind) - 1
)
m.hashes[hashType] = append(m.hashes[hashType], hash.Hash)
} }
} }
} }
slices.SortFunc(m.ids, func(existing, new ID) int { m.printSizes()
return cmp.Or(
cmp.Compare(existing.Domain, new.Domain),
cmp.Compare(existing.ID, new.ID),
)
})
slices.Sort(m.hashes[0])
slices.Sort(m.hashes[1])
slices.Sort(m.hashes[2])
for domain, sourceHashes := range hashes {
for id, h := range sourceHashes {
m.MapHashes(ImageHash{
Hashes: []Hash{{h[0], goimagehash.AHash}, {h[1], goimagehash.DHash}, {h[2], goimagehash.PHash}},
ID: ID{Domain: Source(domain), ID: id},
})
}
}
return nil return nil
} }
func (m *mapStorage) EncodeHashes() (SavedHashes, error) { func (m *MapStorage) printSizes() {
hashes := make(SavedHashes) fmt.Println("Length of hashes:", len(m.hashes[0])+len(m.hashes[1])+len(m.hashes[2]))
for idLocation, hashLocation := range m.idToHash { // fmt.Println("Size of", "hashes:", size.Of(m.hashes)/1024/1024, "MB")
id := m.ids[idLocation] // fmt.Println("Size of", "ids:", size.Of(m.ids)/1024/1024, "MB")
_, ok := hashes[id.Domain] // fmt.Println("Size of", "MapStorage:", size.Of(m)/1024/1024, "MB")
if !ok {
hashes[id.Domain] = make(map[string][3]uint64)
}
// TODO: Add all hashes. Currently saved hashes does not allow multiple IDs for a single hash
hashes[id.Domain][id.ID] = [3]uint64{
m.hashes[0][hashLocation[0][0]],
m.hashes[1][hashLocation[1][0]],
m.hashes[2][hashLocation[2][0]],
}
}
return hashes, nil
}
func (m *mapStorage) AssociateIDs(newids []NewIDs) {
for _, ids := range newids {
oldIDLocation, found := m.getID(ids.OldID)
if !found {
msg := "No IDs belonging to " + ids.OldID.Domain + "exist on this server"
panic(msg)
}
newIDLocation := m.addID(ids.NewID)
for _, hashType := range []int{int(goimagehash.AHash), int(goimagehash.DHash), int(goimagehash.PHash)} {
for _, hashLocation := range m.idToHash[oldIDLocation][hashType] {
m.hashToID[hashType][hashLocation] = Insert(m.hashToID[hashType][hashLocation], newIDLocation)
idHashes := m.idToHash[newIDLocation]
idHashes[hashType] = Insert(idHashes[hashType], hashLocation)
m.idToHash[newIDLocation] = idHashes
}
}
}
}
func (m *mapStorage) GetIDs(id ID) IDList {
idIndex, found := m.getID(id)
if !found {
msg := "No IDs belonging to " + id.Domain + "exist on this server"
panic(msg)
}
ids := make(IDList)
for _, hashLocation := range m.idToHash[idIndex][0] {
for _, foundIDLocation := range m.hashToID[0][hashLocation] {
foundID := m.ids[foundIDLocation]
ids[foundID.Domain] = Insert(ids[foundID.Domain], foundID.ID)
}
}
for _, hashLocation := range m.idToHash[idIndex][1] {
for _, foundIDLocation := range m.hashToID[1][hashLocation] {
foundID := m.ids[foundIDLocation]
ids[foundID.Domain] = Insert(ids[foundID.Domain], foundID.ID)
}
}
for _, hashLocation := range m.idToHash[idIndex][2] {
for _, foundIDLocation := range m.hashToID[2][hashLocation] {
foundID := m.ids[foundIDLocation]
ids[foundID.Domain] = Insert(ids[foundID.Domain], foundID.ID)
}
}
return ids
} }
func NewMapStorage() (HashStorage, error) { func NewMapStorage() (HashStorage, error) {
storage := &mapStorage{ storage := &MapStorage{
hashMutex: sync.RWMutex{}, basicMapStorage: basicMapStorage{
idToHash: make(map[int][3][]int), hashMutex: sync.RWMutex{},
hashToID: [3]map[int][]int{ hashes: [3]map[uint64]*[]ID{
make(map[int][]int), make(map[uint64]*[]ID),
make(map[int][]int), make(map[uint64]*[]ID),
make(map[int][]int), make(map[uint64]*[]ID),
},
},
partialHash: [3][8]map[uint8][]uint64{
{
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
},
{
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
},
{
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
},
}, },
}
for i := range storage.partialHash[0] {
storage.partialHash[0][i] = make(map[uint8][]int)
}
for i := range storage.partialHash[1] {
storage.partialHash[1][i] = make(map[uint8][]int)
}
for i := range storage.partialHash[2] {
storage.partialHash[2][i] = make(map[uint8][]int)
} }
return storage, nil return storage, nil
} }

131
sqlite.go
View File

@ -8,6 +8,7 @@ import (
"log" "log"
"math/bits" "math/bits"
"strings" "strings"
"time"
"gitea.narnian.us/lordwelch/goimagehash" "gitea.narnian.us/lordwelch/goimagehash"
_ "modernc.org/sqlite" _ "modernc.org/sqlite"
@ -67,11 +68,11 @@ func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, items ...interface{
func (s *sqliteStorage) findPartialHashes(max int, search_hash int64, kind goimagehash.Kind) ([]sqliteHash, error) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate func (s *sqliteStorage) findPartialHashes(max int, search_hash int64, kind goimagehash.Kind) ([]sqliteHash, error) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
hashes := []sqliteHash{} hashes := []sqliteHash{}
statement, err := s.db.PrepareContext(context.Background(), `SELECT rowid,hash,kind FROM Hashes WHERE (kind=?) AND (((hash >> (0 * 8) & 0xFF)=(? >> (0 * 8) & 0xFF)) OR ((hash >> (1 * 8) & 0xFF)=(? >> (1 * 8) & 0xFF)) OR ((hash >> (2 * 8) & 0xFF)=(? >> (2 * 8) & 0xFF)) OR ((hash >> (3 * 8) & 0xFF)=(? >> (3 * 8) & 0xFF)) OR ((hash >> (4 * 8) & 0xFF)=(? >> (4 * 8) & 0xFF)) OR ((hash >> (5 * 8) & 0xFF)=(? >> (5 * 8) & 0xFF)) OR ((hash >> (6 * 8) & 0xFF)=(? >> (6 * 8) & 0xFF)) OR ((hash >> (7 * 8) & 0xFF)=(? >> (7 * 8) & 0xFF))) ORDER BY kind,hash;`) statement, err := s.db.PrepareContext(context.Background(), `SELECT rowid,hash,kind FROM Hashes WHERE (kind=?) AND (((hash >> (0 * 8) & 0xFF)=(?2 >> (0 * 8) & 0xFF)) OR ((hash >> (1 * 8) & 0xFF)=(?2 >> (1 * 8) & 0xFF)) OR ((hash >> (2 * 8) & 0xFF)=(?2 >> (2 * 8) & 0xFF)) OR ((hash >> (3 * 8) & 0xFF)=(?2 >> (3 * 8) & 0xFF)) OR ((hash >> (4 * 8) & 0xFF)=(?2 >> (4 * 8) & 0xFF)) OR ((hash >> (5 * 8) & 0xFF)=(?2 >> (5 * 8) & 0xFF)) OR ((hash >> (6 * 8) & 0xFF)=(?2 >> (6 * 8) & 0xFF)) OR ((hash >> (7 * 8) & 0xFF)=(?2 >> (7 * 8) & 0xFF)));`)
if err != nil { if err != nil {
return hashes, err return hashes, err
} }
rows, err := statement.Query(kind, int64(search_hash), int64(search_hash), int64(search_hash), int64(search_hash), int64(search_hash), int64(search_hash), int64(search_hash), int64(search_hash)) rows, err := statement.Query(kind, int64(search_hash))
if err != nil { if err != nil {
return hashes, err return hashes, err
} }
@ -93,6 +94,7 @@ func (s *sqliteStorage) findPartialHashes(max int, search_hash int64, kind goima
} }
} }
rows.Close() rows.Close()
logTime("Filter partial " + kind.String())
statement, err = s.db.PrepareContext(context.Background(), `SELECT DISTINCT IDS.domain, IDs.id, id_hash.hashid FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid in (`+strings.TrimRight(strings.Repeat("?,", len(hashes)), ",")+`)) ORDER BY IDs.domain, IDs.ID;`) statement, err = s.db.PrepareContext(context.Background(), `SELECT DISTINCT IDS.domain, IDs.id, id_hash.hashid FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid in (`+strings.TrimRight(strings.Repeat("?,", len(hashes)), ",")+`)) ORDER BY IDs.domain, IDs.ID;`)
if err != nil { if err != nil {
@ -161,6 +163,7 @@ CREATE INDEX IF NOT EXISTS hash_8_index ON Hashes ((hash >> (7 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, id); CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, id);
PRAGMA shrink_memory; PRAGMA shrink_memory;
ANALYZE;
`) `)
if err != nil { if err != nil {
return err return err
@ -168,15 +171,38 @@ PRAGMA shrink_memory;
return nil return nil
} }
// Package-level stopwatch state used by resetTime and logTime.
var (
	// total is the time accumulated by logTime calls since the last resetTime.
	total time.Duration
	// t is the instant of the most recent resetTime or logTime call.
	t = time.Now()
)
// resetTime clears the accumulated total and restarts the stopwatch at the
// current instant.
func resetTime() {
	total, t = 0, time.Now()
}
// logTime prints the time elapsed since the previous resetTime/logTime call
// under the given label, together with the running total, and then restarts
// the stopwatch from the current instant.
func logTime(log string) {
	now := time.Now()
	elapsed := now.Sub(t)
	total += elapsed
	t = now
	fmt.Printf("total: %v, %s: %v\n", total, log, elapsed)
}
func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) { func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var foundMatches []Result var (
foundMatches []Result
)
resetTime()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
statement, err := s.db.Prepare(`SELECT rowid,hash,kind FROM Hashes WHERE ` + strings.TrimSuffix(strings.Repeat("(hash=? AND kind=?) OR", len(hashes)), "OR") + `ORDER BY kind,hash;`) statement, err := s.db.Prepare(`SELECT rowid,hash,kind FROM Hashes WHERE ` + strings.TrimSuffix(strings.Repeat("(hash=? AND kind=?) OR", len(hashes)), "OR") + `ORDER BY kind,hash;`)
if err != nil { if err != nil {
logTime("Fail exact")
return foundMatches, err return foundMatches, err
} }
args := make([]interface{}, 0, len(hashes)*2) args := make([]interface{}, 0, len(hashes)*2)
for _, hash := range hashes { for _, hash := range hashes {
if hash.Hash != 0 { if hash.Hash != 0 {
@ -195,6 +221,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
if len(foundMatches) > 0 && exactOnly { if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil return foundMatches, nil
} }
logTime("Search Exact")
} }
foundHashes := make(map[uint64]struct{}) foundHashes := make(map[uint64]struct{})
@ -204,6 +231,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
if err != nil { if err != nil {
return foundMatches, err return foundMatches, err
} }
logTime("Search partial " + hash.Kind.String())
for _, hash := range hashes { for _, hash := range hashes {
if _, alreadyMatched := foundHashes[hash.Hash.Hash]; !alreadyMatched { if _, alreadyMatched := foundHashes[hash.Hash.Hash]; !alreadyMatched {
@ -219,14 +247,18 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
} }
func (s *sqliteStorage) MapHashes(hash ImageHash) { func (s *sqliteStorage) MapHashes(hash ImageHash) {
insertHashes, err := s.db.Prepare(` tx, err := s.db.BeginTx(context.Background(), nil)
INSERT INTO Hashes (hash,kind) VALUES (?,?) ON CONFLICT DO UPDATE SET hash=?1 RETURNING hashid; if err != nil {
panic(err)
}
insertHashes, err := tx.Prepare(`
INSERT INTO Hashes (hash,kind) VALUES (?,?) ON CONFLICT DO UPDATE SET hash=?1 RETURNING hashid
`) `)
if err != nil { if err != nil {
panic(err) panic(err)
} }
rows, err := s.db.Query(` rows, err := tx.Query(`
INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RETURNING idid; INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RETURNING idid
`, hash.ID.Domain, hash.ID.ID) `, hash.ID.Domain, hash.ID.ID)
if err != nil { if err != nil {
panic(err) panic(err)
@ -258,12 +290,19 @@ INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RET
} }
hash_ids = append(hash_ids, id) hash_ids = append(hash_ids, id)
} }
var ids []any
for _, hash_id := range hash_ids { for _, hash_id := range hash_ids {
_, err = s.db.Exec(`INSERT INTO id_hash (hashid,idid) VALUES (?, ?) ON CONFLICT DO NOTHING;`, hash_id, id_id) ids = append(ids, hash_id, id_id)
if err != nil {
panic(fmt.Errorf("Failed inserting: %v,%v: %w", hash.ID.Domain, hash.ID.ID, err))
}
} }
_, err = tx.Exec(`INSERT INTO id_hash (hashid,idid) VALUES `+strings.TrimSuffix(strings.Repeat("(?, ?),", len(hash_ids)), ",")+` ON CONFLICT DO NOTHING;`, ids...)
if err != nil {
panic(fmt.Errorf("Failed inserting: %v,%v: %w", hash.ID.Domain, hash.ID.ID, err))
}
err = tx.Commit()
if err != nil {
panic(err)
}
insertHashes.Close()
} }
func (s *sqliteStorage) DecodeHashes(hashes SavedHashes) error { func (s *sqliteStorage) DecodeHashes(hashes SavedHashes) error {
@ -272,9 +311,15 @@ func (s *sqliteStorage) DecodeHashes(hashes SavedHashes) error {
return err return err
} }
for domain, sourceHashes := range hashes { for hashType, sourceHashes := range hashes.Hashes {
for id, h := range sourceHashes { hashKind := goimagehash.Kind(hashType + 1)
s.MapHashes(ImageHash{[]Hash{{h[0], goimagehash.AHash}, {h[1], goimagehash.DHash}, {h[2], goimagehash.PHash}}, ID{domain, id}}) for hash, idsLocations := range sourceHashes {
for _, id := range hashes.IDs[idsLocations] {
s.MapHashes(ImageHash{
Hashes: []Hash{{hash, hashKind}},
ID: id,
})
}
} }
} }
err = s.createIndexes() err = s.createIndexes()
@ -285,48 +330,27 @@ func (s *sqliteStorage) DecodeHashes(hashes SavedHashes) error {
} }
func (s *sqliteStorage) EncodeHashes() (SavedHashes, error) { func (s *sqliteStorage) EncodeHashes() (SavedHashes, error) {
hashes := make(SavedHashes) hashes := SavedHashes{}
conn, err := s.db.Conn(context.Background()) conn, err := s.db.Conn(context.Background())
if err != nil { if err != nil {
return hashes, err return hashes, err
} }
defer conn.Close() defer conn.Close()
rows, err := conn.QueryContext(context.Background(), "SELECT DISTINCT (domain) FROM IDs ORDER BY domain;") rows, err := conn.QueryContext(context.Background(), "SELECT IDs.domain,IDs.id,Hashes.hash,Hashes.kind FROM Hashes JOIN id_hash ON id_hash.hashid = hashes.rowid JOIN IDs ON IDs.rowid = id_hash.idid ORDER BY IDs.ID,Hashes.kind,Hashes.hash;")
if err != nil {
rows.Close()
return hashes, err
}
var (
id ID
hash Hash
)
err = rows.Scan(&id.Domain, &id.ID, &hash.Hash, &hash.Kind)
if err != nil { if err != nil {
return hashes, err return hashes, err
} }
sources := make([]string, 0, 10) hashes.InsertHash(hash, id)
for rows.Next() {
var source string
if err = rows.Scan(&source); err != nil {
rows.Close()
return hashes, err
}
sources = append(sources, source)
}
for _, source := range sources {
rows, err = conn.QueryContext(context.Background(), "SELECT IDs.id,Hashes.hash,Hashes.kind FROM Hashes JOIN id_hash ON id_hash.hashid = hashes.rowid JOIN IDs ON IDs.rowid = id_hash.idid WHERE IDs.domain = ? ORDER BY IDs.ID,Hashes.kind,Hashes.hash;", source)
if err != nil {
rows.Close()
return hashes, err
}
var (
id string
hash int64
typ goimagehash.Kind
)
err = rows.Scan(&id, &hash, &typ)
if err != nil {
return hashes, err
}
_, ok := hashes[Source(source)]
if !ok {
hashes[Source(source)] = make(map[string][3]uint64)
}
h := hashes[Source(source)][id]
h[typ-1] = uint64(hash)
hashes[Source(source)][id] = h
}
return hashes, nil return hashes, nil
} }
@ -415,16 +439,6 @@ CREATE TABLE IF NOT EXISTS Hashes(
UNIQUE(kind, hash) UNIQUE(kind, hash)
); );
CREATE INDEX IF NOT EXISTS hash_index ON Hashes (kind, hash);
CREATE INDEX IF NOT EXISTS hash_1_index ON Hashes ((hash >> (0 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_2_index ON Hashes ((hash >> (1 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_3_index ON Hashes ((hash >> (2 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_4_index ON Hashes ((hash >> (3 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_5_index ON Hashes ((hash >> (4 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_6_index ON Hashes ((hash >> (5 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_7_index ON Hashes ((hash >> (6 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_8_index ON Hashes ((hash >> (7 * 8) & 0xFF));
CREATE TABLE IF NOT EXISTS IDs( CREATE TABLE IF NOT EXISTS IDs(
id TEXT NOT NULL, id TEXT NOT NULL,
domain TEXT NOT NULL, domain TEXT NOT NULL,
@ -445,6 +459,7 @@ CREATE TABLE IF NOT EXISTS id_hash(
if err != nil { if err != nil {
panic(err) panic(err)
} }
sqlite.createIndexes()
sqlite.db.SetMaxOpenConns(1) sqlite.db.SetMaxOpenConns(1)
return sqlite, nil return sqlite, nil
} }

8
sqlite_no_cgo.go Normal file
View File

@ -0,0 +1,8 @@
//go:build !cgo
package ch
import (
_ "github.com/ncruces/go-sqlite3/driver"
_ "github.com/ncruces/go-sqlite3/embed"
)

105
vp-tree.go Normal file
View File

@ -0,0 +1,105 @@
package ch
import (
"errors"
"fmt"
"math/bits"
"gitea.narnian.us/lordwelch/goimagehash"
"gonum.org/v1/gonum/spatial/vptree"
)
// VPTree is a hash storage that keeps one vptree.Tree per hash kind.
type VPTree struct {
	// trees holds the tree for each hash kind, indexed by int(kind)-1.
	// The trees are built by DecodeHashes.
	trees [3]*vptree.Tree
	// hashes holds the items each tree is built from, using the same indexing.
	hashes [3][]vptree.Comparable
}
// VPHash is the item stored in a VPTree: a single hash together with the IDs
// associated with it. It implements vptree.Comparable through Distance.
type VPHash struct {
	// Hash is the hash value and its kind.
	Hash Hash
	// IDs are the IDs recorded for this hash.
	IDs []ID
}
// Distance reports the number of differing bits between h and c.
//
// When c is not a *VPHash the two values cannot be compared and -99 is
// returned.
func (h *VPHash) Distance(c vptree.Comparable) float64 {
	other, isVPHash := c.(*VPHash)
	if isVPHash {
		differing := h.Hash.Hash ^ other.Hash.Hash
		return float64(bits.OnesCount64(differing))
	}
	return -99
}
// GetMatches searches, for every hash in hashes, the tree of that hash's kind
// for stored hashes whose bit distance to it is at most max.
//
// If exactOnly is true and at least one stored hash is at distance 0, only
// the exact matches are returned. Otherwise the exact matches are returned
// first, followed by all remaining matches.
//
// An error is returned if a hash has a kind for which no tree slot exists.
// Kinds whose tree has not been built yet contribute no matches.
func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
	var (
		matches      []Result
		exactMatches []Result
	)
	for _, hash := range hashes {
		hashType := int(hash.Kind) - 1
		if hashType < 0 || hashType >= len(v.trees) {
			return nil, fmt.Errorf("unsupported hash kind %v", hash.Kind)
		}
		tree := v.trees[hashType]
		if tree == nil {
			// DecodeHashes has not built this tree, so nothing can match.
			continue
		}

		results := vptree.NewDistKeeper(float64(max))
		tree.NearestSet(results, &VPHash{Hash: hash})
		for _, result := range results.Heap {
			vphash, ok := result.Comparable.(*VPHash)
			if !ok {
				// NOTE(review): the keeper can still contain its placeholder
				// entry with a nil Comparable (e.g. for an empty tree);
				// skip anything that is not one of our stored hashes.
				continue
			}
			match := Result{
				IDs:      ToIDList(vphash.IDs),
				Distance: int(result.Dist),
				Hash:     vphash.Hash,
			}
			if result.Dist == 0 {
				exactMatches = append(exactMatches, match)
			} else {
				matches = append(matches, match)
			}
		}
	}
	if len(exactMatches) > 0 && exactOnly {
		return exactMatches, nil
	}
	// Cap the capacity so that append copies instead of writing into the
	// spare capacity of exactMatches.
	return append(exactMatches[:len(exactMatches):len(exactMatches)], matches...), nil
}
// MapHashes is not implemented for VPTree; calling it always panics.
func (v *VPTree) MapHashes(ImageHash) {
	panic("Not Implemented")
}
// DecodeHashes appends every saved hash, together with its IDs, to the item
// list of its kind and then rebuilds the tree of each kind from that list.
//
// The first error reported by vptree.New is returned; trees for later kinds
// are not built in that case.
func (v *VPTree) DecodeHashes(hashes SavedHashes) error {
	for kindIndex, stored := range hashes.Hashes {
		// Saved hashes are grouped by kind index, which is the kind value minus one.
		kind := goimagehash.Kind(kindIndex + 1)
		for value, idsIndex := range stored {
			item := &VPHash{Hash{value, kind}, hashes.IDs[idsIndex]}
			v.hashes[kindIndex] = append(v.hashes[kindIndex], item)
		}
	}
	for kindIndex := range v.trees {
		tree, err := vptree.New(v.hashes[kindIndex], 3, nil)
		v.trees[kindIndex] = tree
		if err != nil {
			return err
		}
	}
	return nil
}
// EncodeHashes is not implemented for VPTree; it always returns an empty
// SavedHashes and a "Not Implemented" error.
func (v *VPTree) EncodeHashes() (SavedHashes, error) {
	return SavedHashes{}, errors.New("Not Implemented")
}
// AssociateIDs is not implemented for VPTree; calling it always panics.
func (v *VPTree) AssociateIDs(newIDs []NewIDs) {
	panic("Not Implemented")
}
// GetIDs performs no lookup for VPTree; it always returns nil.
func (v *VPTree) GetIDs(id ID) IDList {
	return nil
}
// NewVPStorage returns an empty VPTree storage. The item list of every hash
// kind is pre-allocated with room for one million entries; the trees
// themselves are left unset until DecodeHashes builds them.
func NewVPStorage() (HashStorage, error) {
	const initialCapacity = 1_000_000

	storage := &VPTree{}
	for kindIndex := range storage.hashes {
		storage.hashes[kindIndex] = make([]vptree.Comparable, 0, initialCapacity)
	}
	return storage, nil
}