comic-hasher/BasicMap.go
2025-02-22 13:45:41 -08:00

264 lines
6.8 KiB
Go

package ch
import (
"cmp"
"errors"
"fmt"
"math/bits"
"slices"
"sync"
"gitea.narnian.us/lordwelch/goimagehash"
)
type basicMapStorage struct {
hashMutex *sync.RWMutex
ids map[ID]*[]ID
aHashes []SavedHash
dHashes []SavedHash
pHashes []SavedHash
}
// atleast must have a read lock before using
func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
matchingHashes := make([]Result, 0, 20) // hope that we don't need more
mappedIds := map[*[]ID]bool{}
for _, storedHash := range *b.getCurrentHashes(kind) {
distance := bits.OnesCount64(searchHash ^ storedHash.Hash.Hash)
if distance <= maxDistance {
ids := b.ids[storedHash.ID]
if mappedIds[ids] {
continue
}
mappedIds[ids] = true
matchingHashes = append(matchingHashes, Result{ToIDList(*b.ids[storedHash.ID]), distance, storedHash.Hash})
}
}
return matchingHashes
}
func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var (
foundMatches []Result
tl timeLog
)
tl.resetTime()
defer tl.logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes {
mappedIds := map[*[]ID]bool{}
index, count := b.findHash(hash)
if count > 0 {
for _, storedHash := range (*b.getCurrentHashes(hash.Kind))[index : index+count] {
ids := b.ids[storedHash.ID]
if mappedIds[ids] {
continue
}
mappedIds[ids] = true
foundMatches = append(foundMatches, Result{
Distance: 0,
Hash: storedHash.Hash,
IDs: ToIDList(*b.ids[storedHash.ID]),
})
}
}
}
tl.logTime("Search Exact")
return foundMatches, nil
}
foundHashes := make(map[uint64]struct{})
totalPartialHashes := 0
for _, hash := range hashes {
foundMatches = append(foundMatches, b.atleast(hash.Kind, max, hash.Hash)...)
}
fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes))
return foundMatches, nil
}
// getCurrentHashes must have a read lock before using
func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]SavedHash {
if kind == goimagehash.AHash {
return &b.aHashes
}
if kind == goimagehash.DHash {
return &b.dHashes
}
if kind == goimagehash.PHash {
return &b.pHashes
}
panic("Unknown hash type: " + kind.String())
}
// findHash must have a read lock before using
// return value is index, count
// if count < 1 then no results were found
func (b *basicMapStorage) findHash(hash Hash) (int, int) {
currentHashes := *b.getCurrentHashes(hash.Kind)
index, found := slices.BinarySearchFunc(currentHashes, hash, func(existing SavedHash, target Hash) int {
return cmp.Compare(existing.Hash.Hash, target.Hash)
})
if !found {
return index, 0
}
count := 0
for i := index + 1; i < len(currentHashes) && currentHashes[i].Hash.Hash == hash.Hash; i++ {
count++
}
return index, count
}
// insertHash must already have a lock
func (b *basicMapStorage) insertHash(hash Hash, id ID) {
currentHashes := b.getCurrentHashes(hash.Kind)
index, count := b.findHash(hash)
max := index + count
for ; index < max; index++ {
if (*currentHashes)[index].ID == id {
return
}
}
*currentHashes = slices.Insert(*currentHashes, index, SavedHash{hash, id})
if _, mapped := b.ids[id]; !mapped {
b.ids[id] = &[]ID{id}
}
}
func (b *basicMapStorage) MapHashes(hash ImageHash) {
b.hashMutex.Lock()
defer b.hashMutex.Unlock()
for _, ih := range hash.Hashes {
b.insertHash(ih, hash.ID)
}
}
// DecodeHashes must already have a lock
func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error {
b.ids = make(map[ID]*[]ID, len(hashes.Hashes))
// Initialize all the known equal IDs
for _, ids := range hashes.IDs {
for _, id := range ids {
b.ids[id] = &ids
}
}
slices.SortFunc(hashes.Hashes, func(existing, target SavedHash) int {
return cmp.Or(
cmp.Compare(existing.Hash.Kind, target.Hash.Kind),
cmp.Compare(existing.Hash.Hash, target.Hash.Hash),
cmp.Compare(existing.ID.Domain, target.ID.Domain),
cmp.Compare(existing.ID.ID, target.ID.ID),
)
})
// Assume they are probably fairly equally split between hash types
b.aHashes = make([]SavedHash, 0, len(hashes.Hashes)/3)
b.dHashes = make([]SavedHash, 0, len(hashes.Hashes)/3)
b.pHashes = make([]SavedHash, 0, len(hashes.Hashes)/3)
for _, savedHash := range hashes.Hashes {
if savedHash.Hash.Kind == goimagehash.AHash {
b.aHashes = append(b.aHashes, savedHash)
}
if savedHash.Hash.Kind == goimagehash.DHash {
b.dHashes = append(b.dHashes, savedHash)
}
if savedHash.Hash.Kind == goimagehash.PHash {
b.pHashes = append(b.pHashes, savedHash)
}
if savedHash.ID == (ID{}) {
fmt.Println("Empty ID detected")
panic(savedHash)
}
// All known equal IDs are already mapped we can add any missing ones from hashes
if _, ok := b.ids[savedHash.ID]; !ok {
b.ids[savedHash.ID] = &[]ID{savedHash.ID}
}
}
hashCmp := func(existing, target SavedHash) int {
return cmp.Or(
cmp.Compare(existing.Hash.Hash, target.Hash.Hash),
cmp.Compare(existing.ID.Domain, target.ID.Domain),
cmp.Compare(existing.ID.ID, target.ID.ID),
)
}
slices.SortFunc(b.aHashes, hashCmp)
slices.SortFunc(b.dHashes, hashCmp)
slices.SortFunc(b.pHashes, hashCmp)
return nil
}
// EncodeHashes should already have a lock
func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) {
savedHashes := SavedHashes{
Hashes: make([]SavedHash, 0, len(b.aHashes)+len(b.dHashes)+len(b.pHashes)),
}
// Only keep groups >1 as they will be mapped in SavedHashes.Hashes
for _, ids := range b.ids {
if len(*ids) > 1 {
savedHashes.IDs = append(savedHashes.IDs, *ids)
}
}
savedHashes.Hashes = append(savedHashes.Hashes, b.aHashes...)
savedHashes.Hashes = append(savedHashes.Hashes, b.dHashes...)
savedHashes.Hashes = append(savedHashes.Hashes, b.pHashes...)
return savedHashes, nil
}
func (b *basicMapStorage) AssociateIDs(newids []NewIDs) error {
for _, newid := range newids {
b.hashMutex.RLock()
ids, found := b.ids[newid.OldID]
b.hashMutex.RUnlock()
if !found {
msg := "ID not found on this server"
return errors.New(msg)
}
b.hashMutex.Lock()
*ids = InsertID(*ids, newid.NewID)
b.hashMutex.Unlock()
}
return nil
}
func (b *basicMapStorage) GetIDs(id ID) IDList {
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
ids, found := b.ids[id]
if !found {
return nil
}
return ToIDList(*ids)
}
func NewBasicMapStorage() (HashStorage, error) {
storage := &basicMapStorage{
hashMutex: &sync.RWMutex{},
ids: make(map[ID]*[]ID),
aHashes: []SavedHash{},
dHashes: []SavedHash{},
pHashes: []SavedHash{},
}
return storage, nil
}