From 095c78f0e7bb1e7d95465da3d382dde7899dd76b Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Thu, 12 Sep 2024 11:42:29 -0700 Subject: [PATCH] Optimize iterating hashes --- BasicMap.go | 65 ++++++++++++++++++++++++++++++++++++++--------------- map.go | 35 ++++++++++++++++------------- 2 files changed, 66 insertions(+), 34 deletions(-) diff --git a/BasicMap.go b/BasicMap.go index 53ffed6..b61f930 100644 --- a/BasicMap.go +++ b/BasicMap.go @@ -1,8 +1,10 @@ package ch import ( + "cmp" "fmt" "math/bits" + "slices" "sync" "gitea.narnian.us/lordwelch/goimagehash" @@ -12,16 +14,21 @@ type basicMapStorage struct { hashMutex sync.RWMutex ids map[ID]*[]ID - hashes [3]map[uint64]*[]ID + hashes [3][]structHash +} + +type structHash struct { + hash uint64 + ids *[]ID } func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result { hashType := int(hashKind) - 1 matchingHashes := make([]Result, 0, 100) // hope that we don't need all of them - for storedHash, ids := range b.hashes[hashType] { - distance := bits.OnesCount64(searchHash ^ storedHash) + for _, storedHash := range b.hashes[hashType] { + distance := bits.OnesCount64(searchHash ^ storedHash.hash) if distance <= maxDistance { - matchingHashes = append(matchingHashes, Result{ToIDList(*ids), distance, Hash{storedHash, hashKind}}) + matchingHashes = append(matchingHashes, Result{ToIDList(*storedHash.ids), distance, Hash{storedHash.hash, hashKind}}) } } return matchingHashes @@ -31,25 +38,26 @@ func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([] b.hashMutex.RLock() defer b.hashMutex.RUnlock() resetTime() + defer logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly)) if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate for _, hash := range hashes { hashType := int(hash.Kind) - 1 - ids := b.hashes[hashType][hash.Hash] - if ids != nil && len(*ids) > 0 { + index, hashFound := b.findHash(hashType, hash.Hash) + if hashFound { foundMatches = append(foundMatches, Result{ Distance: 0, Hash: hash, - IDs: ToIDList(*ids), + IDs: ToIDList(*b.hashes[hashType][index].ids), }) } } + logTime("Search Exact") // If we have exact matches don't bother with other matches if len(foundMatches) > 0 && exactOnly { return foundMatches, nil } - logTime("Search Exact") } foundHashes := make(map[uint64]struct{}) @@ -66,28 +74,49 @@ func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([] } fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes)) - logTime("Search Complete") go b.printSizes() return foundMatches, nil } +func (b *basicMapStorage) findHash(hashType int, hash uint64) (int, bool) { + return slices.BinarySearchFunc(b.hashes[hashType], hash, func(e structHash, t uint64) int { + return cmp.Compare(e.hash, t) + }) +} +func (b *basicMapStorage) InsertHash(hashType int, hash uint64, ids *[]ID) { + index, hashFound := b.findHash(hashType, hash) + if hashFound { + return + } + slices.Insert(b.hashes[hashType], index, structHash{hash, ids}) +} + func (b *basicMapStorage) MapHashes(hash ImageHash) { for _, ih := range hash.Hashes { var ( hashType = int(ih.Kind) - 1 ) + ids, ok := b.ids[hash.ID] + if !ok { + ids = &[]ID{hash.ID} + } - *b.hashes[hashType][ih.Hash] = InsertID((*b.hashes[hashType][ih.Hash]), hash.ID) + b.InsertHash(hashType, ih.Hash, ids) } } func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error { for hashType, sourceHashes := range hashes.Hashes { - b.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes)) + b.hashes[hashType] = make([]structHash, len(sourceHashes)) for savedHash, idlistLocation := range sourceHashes { - b.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation] + b.hashes[hashType] = append(b.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]}) } } + for hashType := range b.hashes { + slices.SortFunc(b.hashes[hashType], func(a, b structHash) int { + return cmp.Compare(a.hash, b.hash) + }) + } b.printSizes() return nil } @@ -110,8 +139,8 @@ func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) { idmap[ids] = len(hashes.IDs) } for hashType, hashToID := range b.hashes { - for hash, ids := range hashToID { - hashes.Hashes[hashType][hash] = idmap[ids] + for _, hash := range hashToID { + hashes.Hashes[hashType][hash.hash] = idmap[hash.ids] } } return hashes, nil @@ -141,10 +170,10 @@ func NewBasicMapStorage() (HashStorage, error) { storage := &basicMapStorage{ hashMutex: sync.RWMutex{}, - hashes: [3]map[uint64]*[]ID{ - make(map[uint64]*[]ID), - make(map[uint64]*[]ID), - make(map[uint64]*[]ID), + hashes: [3][]structHash{ + []structHash{}, + []structHash{}, + []structHash{}, }, } return storage, nil diff --git a/map.go b/map.go index 09615e2..09720ff 100644 --- a/map.go +++ b/map.go @@ -1,6 +1,7 @@ package ch import ( + "cmp" "fmt" "slices" "sync" @@ -16,25 +17,26 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul m.hashMutex.RLock() defer m.hashMutex.RUnlock() resetTime() + defer logTime("Search Complete") if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate for _, hash := range hashes { hashType := int(hash.Kind) - 1 - idlist := m.hashes[hashType][hash.Hash] - if idlist != nil && len(*idlist) > 0 { + index, hashFound := m.findHash(hashType, hash.Hash) + if hashFound { foundMatches = append(foundMatches, Result{ Distance: 0, Hash: hash, - IDs: ToIDList(*idlist), + IDs: ToIDList(*m.hashes[hashType][index].ids), }) } } // If we have exact matches don't bother with other matches + logTime("Search Exact") if len(foundMatches) > 0 && exactOnly { return foundMatches, nil } - logTime("Search Exact") } totalPartialHashes := 0 @@ -46,15 +48,14 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul totalPartialHashes += len(partialHashes) for _, match := range Atleast(max, searchHash.Hash, partialHashes) { _, alreadyMatched := foundHashes[match.Hash] - if matchedResults, ok := m.hashes[hashType][match.Hash]; ok && !alreadyMatched { + if index, hashFound := m.findHash(hashType, match.Hash); hashFound && !alreadyMatched { foundHashes[match.Hash] = struct{}{} - foundMatches = append(foundMatches, Result{IDs: ToIDList(*matchedResults), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}}) + foundMatches = append(foundMatches, Result{IDs: ToIDList(*m.hashes[hashType][index].ids), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}}) } } } } fmt.Println("Total partial hashes tested:", totalPartialHashes) - logTime("Search Complete") go m.printSizes() return foundMatches, nil } @@ -71,14 +72,16 @@ func (m *MapStorage) MapHashes(hash ImageHash) { func (m *MapStorage) DecodeHashes(hashes SavedHashes) error { for hashType, sourceHashes := range hashes.Hashes { - m.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes)) + m.hashes[hashType] = make([]structHash, len(sourceHashes)) for savedHash, idlistLocation := range sourceHashes { - for i, partialHash := range SplitHash(savedHash) { - m.partialHash[hashType][i][partialHash] = append(m.partialHash[hashType][i][partialHash], savedHash) - } - m.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation] + m.hashes[hashType] = append(m.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]}) } } + for hashType := range m.hashes { + slices.SortFunc(m.hashes[hashType], func(a, b structHash) int { + return cmp.Compare(a.hash, b.hash) + }) + } m.printSizes() for _, partialHashes := range m.partialHash { for _, partMap := range partialHashes { @@ -104,10 +107,10 @@ func NewMapStorage() (HashStorage, error) { storage := &MapStorage{ basicMapStorage: basicMapStorage{ hashMutex: sync.RWMutex{}, - hashes: [3]map[uint64]*[]ID{ - make(map[uint64]*[]ID), - make(map[uint64]*[]ID), - make(map[uint64]*[]ID), + hashes: [3][]structHash{ + []structHash{}, + []structHash{}, + []structHash{}, }, }, partialHash: [3][8]map[uint8][]uint64{