// comic-hasher/BasicMap.go
// 202 lines, 5.1 KiB, Go

package ch
import (
	"cmp"
	"errors"
	"fmt"
	"math/bits"
	"slices"
	"sync"

	"gitea.narnian.us/lordwelch/goimagehash"
)
type basicMapStorage struct {
2024-10-16 17:56:19 -07:00
hashMutex *sync.RWMutex
ids map[ID]*[]ID
2024-09-12 11:42:29 -07:00
hashes [3][]structHash
}
// structHash pairs one stored hash value with the ID list attached to it.
type structHash struct {
	hash uint64 // raw 64-bit hash value; the sort key within its hash kind
	ids  *[]ID  // ID list for this hash, shared with the entries of basicMapStorage.ids
}
func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
hashType := int(hashKind) - 1
matchingHashes := make([]Result, 0, 100) // hope that we don't need all of them
2024-10-16 17:56:19 -07:00
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
2024-09-12 11:42:29 -07:00
for _, storedHash := range b.hashes[hashType] {
distance := bits.OnesCount64(searchHash ^ storedHash.hash)
if distance <= maxDistance {
2024-09-12 11:42:29 -07:00
matchingHashes = append(matchingHashes, Result{ToIDList(*storedHash.ids), distance, Hash{storedHash.hash, hashKind}})
}
}
return matchingHashes
}
func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var foundMatches []Result
resetTime()
2024-09-12 11:42:29 -07:00
defer logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes {
hashType := int(hash.Kind) - 1
2024-10-16 17:56:19 -07:00
b.hashMutex.RLock()
2024-09-12 11:42:29 -07:00
index, hashFound := b.findHash(hashType, hash.Hash)
if hashFound {
foundMatches = append(foundMatches, Result{
Distance: 0,
Hash: hash,
2024-09-12 11:42:29 -07:00
IDs: ToIDList(*b.hashes[hashType][index].ids),
})
}
2024-10-16 17:56:19 -07:00
b.hashMutex.RUnlock()
}
2024-09-12 11:42:29 -07:00
logTime("Search Exact")
// If we have exact matches don't bother with other matches
if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil
}
}
foundHashes := make(map[uint64]struct{})
totalPartialHashes := 0
for _, hash := range hashes {
for _, match := range b.Atleast(hash.Kind, max, hash.Hash) {
_, alreadyMatched := foundHashes[match.Hash.Hash]
if alreadyMatched {
continue
}
foundHashes[match.Hash.Hash] = struct{}{}
foundMatches = append(foundMatches, match)
}
}
fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes))
return foundMatches, nil
}
2024-10-16 17:56:19 -07:00
// findHash must have a read lock before using
2024-09-12 11:42:29 -07:00
func (b *basicMapStorage) findHash(hashType int, hash uint64) (int, bool) {
return slices.BinarySearchFunc(b.hashes[hashType], hash, func(e structHash, t uint64) int {
return cmp.Compare(e.hash, t)
})
}
2024-10-16 17:56:19 -07:00
// insertHash will take a write lock if the hash is not found
func (b *basicMapStorage) insertHash(hashType int, hash uint64, ids *[]ID) {
b.hashMutex.RLock()
2024-09-12 11:42:29 -07:00
index, hashFound := b.findHash(hashType, hash)
2024-10-16 17:56:19 -07:00
b.hashMutex.RUnlock()
2024-09-12 11:42:29 -07:00
if hashFound {
return
}
2024-10-16 17:56:19 -07:00
b.hashMutex.Lock()
2024-10-13 22:14:42 -07:00
b.hashes[hashType] = slices.Insert(b.hashes[hashType], index, structHash{hash, ids})
2024-10-16 17:56:19 -07:00
b.hashMutex.Unlock()
2024-09-12 11:42:29 -07:00
}
func (b *basicMapStorage) MapHashes(hash ImageHash) {
for _, ih := range hash.Hashes {
var (
hashType = int(ih.Kind) - 1
)
2024-10-16 17:56:19 -07:00
b.hashMutex.RLock()
2024-09-12 11:42:29 -07:00
ids, ok := b.ids[hash.ID]
2024-10-16 17:56:19 -07:00
b.hashMutex.RUnlock()
2024-09-12 11:42:29 -07:00
if !ok {
2024-10-16 17:56:19 -07:00
b.hashMutex.Lock()
2024-09-12 11:42:29 -07:00
ids = &[]ID{hash.ID}
2024-10-13 22:14:42 -07:00
b.ids[hash.ID] = ids
2024-10-16 17:56:19 -07:00
b.hashMutex.Unlock()
2024-09-12 11:42:29 -07:00
}
2024-10-16 17:56:19 -07:00
b.insertHash(hashType, ih.Hash, ids)
}
}
2024-10-16 17:56:19 -07:00
// DecodeHashes should already have a lock
func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error {
for hashType, sourceHashes := range hashes.Hashes {
2024-09-12 11:42:29 -07:00
b.hashes[hashType] = make([]structHash, len(sourceHashes))
for savedHash, idlistLocation := range sourceHashes {
2024-09-12 11:42:29 -07:00
b.hashes[hashType] = append(b.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]})
2024-10-14 02:02:26 -07:00
for _, id := range hashes.IDs[idlistLocation] {
b.ids[id] = &hashes.IDs[idlistLocation]
}
}
}
2024-09-12 11:42:29 -07:00
for hashType := range b.hashes {
slices.SortFunc(b.hashes[hashType], func(a, b structHash) int {
return cmp.Compare(a.hash, b.hash)
})
}
return nil
}
2024-10-16 17:56:19 -07:00
// EncodeHashes should already have a lock
func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) {
2024-10-13 22:14:42 -07:00
hashes := SavedHashes{
Hashes: [3]map[uint64]int{
make(map[uint64]int),
make(map[uint64]int),
make(map[uint64]int),
},
}
idmap := map[*[]ID]int{}
2024-10-13 22:14:42 -07:00
for _, ids := range b.ids {
if _, ok := idmap[ids]; ok {
continue
}
idmap[ids] = len(hashes.IDs)
2024-10-13 22:14:42 -07:00
hashes.IDs = append(hashes.IDs, *ids)
}
2024-10-13 22:14:42 -07:00
for hashType, hashToID := range b.hashes {
2024-09-12 11:42:29 -07:00
for _, hash := range hashToID {
hashes.Hashes[hashType][hash.hash] = idmap[hash.ids]
}
}
return hashes, nil
}
2024-10-14 02:02:26 -07:00
func (b *basicMapStorage) AssociateIDs(newids []NewIDs) error {
for _, newid := range newids {
2024-10-16 17:56:19 -07:00
b.hashMutex.RLock()
ids, found := b.ids[newid.OldID]
2024-10-16 17:56:19 -07:00
b.hashMutex.RUnlock()
if !found {
2024-10-14 02:02:26 -07:00
msg := "No IDs belonging to " + string(newid.OldID.Domain) + " exist on this server"
return errors.New(msg)
}
2024-10-16 17:56:19 -07:00
b.hashMutex.Lock()
*ids = InsertID(*ids, newid.NewID)
2024-10-16 17:56:19 -07:00
b.hashMutex.Unlock()
}
2024-10-14 02:02:26 -07:00
return nil
}
func (b *basicMapStorage) GetIDs(id ID) IDList {
2024-10-16 17:56:19 -07:00
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
ids, found := b.ids[id]
if !found {
2024-10-14 02:02:26 -07:00
return nil
}
return ToIDList(*ids)
}
func NewBasicMapStorage() (HashStorage, error) {
storage := &basicMapStorage{
2024-10-16 17:56:19 -07:00
hashMutex: &sync.RWMutex{},
2024-10-13 22:14:42 -07:00
ids: make(map[ID]*[]ID),
hashes: [3][]structHash{},
}
return storage, nil
}