Optimize iterating hashes

This commit is contained in:
Timmy Welch 2024-09-12 11:42:29 -07:00
parent 0928ed6ccf
commit 095c78f0e7
2 changed files with 66 additions and 34 deletions

View File

@ -1,8 +1,10 @@
package ch package ch
import ( import (
"cmp"
"fmt" "fmt"
"math/bits" "math/bits"
"slices"
"sync" "sync"
"gitea.narnian.us/lordwelch/goimagehash" "gitea.narnian.us/lordwelch/goimagehash"
@ -12,16 +14,21 @@ type basicMapStorage struct {
hashMutex sync.RWMutex hashMutex sync.RWMutex
ids map[ID]*[]ID ids map[ID]*[]ID
hashes [3]map[uint64]*[]ID hashes [3][]structHash
}
type structHash struct {
hash uint64
ids *[]ID
} }
func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result { func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
hashType := int(hashKind) - 1 hashType := int(hashKind) - 1
matchingHashes := make([]Result, 0, 100) // hope that we don't need all of them matchingHashes := make([]Result, 0, 100) // hope that we don't need all of them
for storedHash, ids := range b.hashes[hashType] { for _, storedHash := range b.hashes[hashType] {
distance := bits.OnesCount64(searchHash ^ storedHash) distance := bits.OnesCount64(searchHash ^ storedHash.hash)
if distance <= maxDistance { if distance <= maxDistance {
matchingHashes = append(matchingHashes, Result{ToIDList(*ids), distance, Hash{storedHash, hashKind}}) matchingHashes = append(matchingHashes, Result{ToIDList(*storedHash.ids), distance, Hash{storedHash.hash, hashKind}})
} }
} }
return matchingHashes return matchingHashes
@ -31,25 +38,26 @@ func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]
b.hashMutex.RLock() b.hashMutex.RLock()
defer b.hashMutex.RUnlock() defer b.hashMutex.RUnlock()
resetTime() resetTime()
defer logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes { for _, hash := range hashes {
hashType := int(hash.Kind) - 1 hashType := int(hash.Kind) - 1
ids := b.hashes[hashType][hash.Hash] index, hashFound := b.findHash(hashType, hash.Hash)
if ids != nil && len(*ids) > 0 { if hashFound {
foundMatches = append(foundMatches, Result{ foundMatches = append(foundMatches, Result{
Distance: 0, Distance: 0,
Hash: hash, Hash: hash,
IDs: ToIDList(*ids), IDs: ToIDList(*b.hashes[hashType][index].ids),
}) })
} }
} }
logTime("Search Exact")
// If we have exact matches don't bother with other matches // If we have exact matches don't bother with other matches
if len(foundMatches) > 0 && exactOnly { if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil return foundMatches, nil
} }
logTime("Search Exact")
} }
foundHashes := make(map[uint64]struct{}) foundHashes := make(map[uint64]struct{})
@ -66,28 +74,49 @@ func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]
} }
fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes)) fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes))
logTime("Search Complete")
go b.printSizes() go b.printSizes()
return foundMatches, nil return foundMatches, nil
} }
// findHash binary-searches b.hashes[hashType] for an entry whose hash equals hash.
//
// It returns the result of slices.BinarySearchFunc: the index of the matching
// entry and true when one exists, otherwise the index at which hash would have
// to be inserted to keep the slice ordered, and false.
//
// The search is only meaningful while b.hashes[hashType] is sorted in ascending
// order of structHash.hash.
func (b *basicMapStorage) findHash(hashType int, hash uint64) (int, bool) {
	byHash := func(entry structHash, target uint64) int {
		return cmp.Compare(entry.hash, target)
	}
	return slices.BinarySearchFunc(b.hashes[hashType], hash, byHash)
}
// InsertHash adds an entry for hash, pointing at ids, to b.hashes[hashType].
//
// The insertion position comes from findHash, so the slice stays ordered by
// hash as long as it was ordered before the call. If an entry for hash already
// exists the call does nothing: the existing entry and its ID list are left
// untouched and ids is ignored.
func (b *basicMapStorage) InsertHash(hashType int, hash uint64, ids *[]ID) {
	index, hashFound := b.findHash(hashType, hash)
	if hashFound {
		return
	}
	// slices.Insert returns the modified slice (possibly a new backing array);
	// it has to be stored back, otherwise the stored slice keeps its old length
	// and the new entry is lost.
	b.hashes[hashType] = slices.Insert(b.hashes[hashType], index, structHash{hash, ids})
}
func (b *basicMapStorage) MapHashes(hash ImageHash) { func (b *basicMapStorage) MapHashes(hash ImageHash) {
for _, ih := range hash.Hashes { for _, ih := range hash.Hashes {
var ( var (
hashType = int(ih.Kind) - 1 hashType = int(ih.Kind) - 1
) )
ids, ok := b.ids[hash.ID]
if !ok {
ids = &[]ID{hash.ID}
}
*b.hashes[hashType][ih.Hash] = InsertID((*b.hashes[hashType][ih.Hash]), hash.ID) b.InsertHash(hashType, ih.Hash, ids)
} }
} }
func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error { func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error {
for hashType, sourceHashes := range hashes.Hashes { for hashType, sourceHashes := range hashes.Hashes {
b.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes)) b.hashes[hashType] = make([]structHash, len(sourceHashes))
for savedHash, idlistLocation := range sourceHashes { for savedHash, idlistLocation := range sourceHashes {
b.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation] b.hashes[hashType] = append(b.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]})
} }
} }
for hashType := range b.hashes {
slices.SortFunc(b.hashes[hashType], func(a, b structHash) int {
return cmp.Compare(a.hash, b.hash)
})
}
b.printSizes() b.printSizes()
return nil return nil
} }
@ -110,8 +139,8 @@ func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) {
idmap[ids] = len(hashes.IDs) idmap[ids] = len(hashes.IDs)
} }
for hashType, hashToID := range b.hashes { for hashType, hashToID := range b.hashes {
for hash, ids := range hashToID { for _, hash := range hashToID {
hashes.Hashes[hashType][hash] = idmap[ids] hashes.Hashes[hashType][hash.hash] = idmap[hash.ids]
} }
} }
return hashes, nil return hashes, nil
@ -141,10 +170,10 @@ func NewBasicMapStorage() (HashStorage, error) {
storage := &basicMapStorage{ storage := &basicMapStorage{
hashMutex: sync.RWMutex{}, hashMutex: sync.RWMutex{},
hashes: [3]map[uint64]*[]ID{ hashes: [3][]structHash{
make(map[uint64]*[]ID), []structHash{},
make(map[uint64]*[]ID), []structHash{},
make(map[uint64]*[]ID), []structHash{},
}, },
} }
return storage, nil return storage, nil

35
map.go
View File

@ -1,6 +1,7 @@
package ch package ch
import ( import (
"cmp"
"fmt" "fmt"
"slices" "slices"
"sync" "sync"
@ -16,25 +17,26 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
m.hashMutex.RLock() m.hashMutex.RLock()
defer m.hashMutex.RUnlock() defer m.hashMutex.RUnlock()
resetTime() resetTime()
defer logTime("Search Complete")
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes { for _, hash := range hashes {
hashType := int(hash.Kind) - 1 hashType := int(hash.Kind) - 1
idlist := m.hashes[hashType][hash.Hash] index, hashFound := m.findHash(hashType, hash.Hash)
if idlist != nil && len(*idlist) > 0 { if hashFound {
foundMatches = append(foundMatches, Result{ foundMatches = append(foundMatches, Result{
Distance: 0, Distance: 0,
Hash: hash, Hash: hash,
IDs: ToIDList(*idlist), IDs: ToIDList(*m.hashes[hashType][index].ids),
}) })
} }
} }
// If we have exact matches don't bother with other matches // If we have exact matches don't bother with other matches
logTime("Search Exact")
if len(foundMatches) > 0 && exactOnly { if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil return foundMatches, nil
} }
logTime("Search Exact")
} }
totalPartialHashes := 0 totalPartialHashes := 0
@ -46,15 +48,14 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
totalPartialHashes += len(partialHashes) totalPartialHashes += len(partialHashes)
for _, match := range Atleast(max, searchHash.Hash, partialHashes) { for _, match := range Atleast(max, searchHash.Hash, partialHashes) {
_, alreadyMatched := foundHashes[match.Hash] _, alreadyMatched := foundHashes[match.Hash]
if matchedResults, ok := m.hashes[hashType][match.Hash]; ok && !alreadyMatched { if index, hashFound := m.findHash(hashType, match.Hash); hashFound && !alreadyMatched {
foundHashes[match.Hash] = struct{}{} foundHashes[match.Hash] = struct{}{}
foundMatches = append(foundMatches, Result{IDs: ToIDList(*matchedResults), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}}) foundMatches = append(foundMatches, Result{IDs: ToIDList(*m.hashes[hashType][index].ids), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}})
} }
} }
} }
} }
fmt.Println("Total partial hashes tested:", totalPartialHashes) fmt.Println("Total partial hashes tested:", totalPartialHashes)
logTime("Search Complete")
go m.printSizes() go m.printSizes()
return foundMatches, nil return foundMatches, nil
} }
@ -71,14 +72,16 @@ func (m *MapStorage) MapHashes(hash ImageHash) {
func (m *MapStorage) DecodeHashes(hashes SavedHashes) error { func (m *MapStorage) DecodeHashes(hashes SavedHashes) error {
for hashType, sourceHashes := range hashes.Hashes { for hashType, sourceHashes := range hashes.Hashes {
m.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes)) m.hashes[hashType] = make([]structHash, len(sourceHashes))
for savedHash, idlistLocation := range sourceHashes { for savedHash, idlistLocation := range sourceHashes {
for i, partialHash := range SplitHash(savedHash) { m.hashes[hashType] = append(m.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]})
m.partialHash[hashType][i][partialHash] = append(m.partialHash[hashType][i][partialHash], savedHash)
}
m.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation]
} }
} }
for hashType := range m.hashes {
slices.SortFunc(m.hashes[hashType], func(a, b structHash) int {
return cmp.Compare(a.hash, b.hash)
})
}
m.printSizes() m.printSizes()
for _, partialHashes := range m.partialHash { for _, partialHashes := range m.partialHash {
for _, partMap := range partialHashes { for _, partMap := range partialHashes {
@ -104,10 +107,10 @@ func NewMapStorage() (HashStorage, error) {
storage := &MapStorage{ storage := &MapStorage{
basicMapStorage: basicMapStorage{ basicMapStorage: basicMapStorage{
hashMutex: sync.RWMutex{}, hashMutex: sync.RWMutex{},
hashes: [3]map[uint64]*[]ID{ hashes: [3][]structHash{
make(map[uint64]*[]ID), []structHash{},
make(map[uint64]*[]ID), []structHash{},
make(map[uint64]*[]ID), []structHash{},
}, },
}, },
partialHash: [3][8]map[uint8][]uint64{ partialHash: [3][8]map[uint8][]uint64{