comic-hasher/BasicMap.go
Timmy Welch 0928ed6ccf Optimize memory usage
Add a basic map storage that does manual searches to conserve memory
Change saved hashes format to allow multiple hashes for a given ID
Add a vptree storage

Maps in Go take up a huge amount of space; changing IDList to []ID took
  memory from over 1GB down to 200MB (note this was on aarch64 MacOS
  which for some reason uses less memory than aarch64 Linux).
  Exhaustive searches using slices took about 30 ms search now takes
  50-60 ms as it takes longer to iterate a map. Partial hashes will
  speed up searches to 8 ms at the cost of 700MB initial memory usage
  and 400MB idle (though this is on MacOS, which for some reason uses
  less memory than aarch64 Linux so probably more like
  900MB initial -> 600 MB idle on an RPI running Linux)
2024-09-07 14:51:18 -07:00

152 lines
3.9 KiB
Go

package ch
import (
"fmt"
"math/bits"
"sync"
"gitea.narnian.us/lordwelch/goimagehash"
)
// basicMapStorage stores image hashes in plain Go maps and answers searches by
// comparing the search hash against the stored hashes.
type basicMapStorage struct {
// hashMutex is read-locked by GetMatches for the duration of a search.
hashMutex sync.RWMutex
// ids maps an ID to a shared list of IDs; GetIDs and AssociateIDs look lists
// up here and EncodeHashes collects the lists it saves from here.
ids map[ID]*[]ID
// hashes holds one map per hash kind, indexed by int(kind)-1. Each map goes
// from a 64-bit hash value to the list of IDs stored for that hash.
hashes [3]map[uint64]*[]ID
}
// Atleast compares searchHash with every stored hash of kind hashKind and
// returns one Result for each stored hash whose bit difference (Hamming
// distance) from searchHash is at most maxDistance.
//
// The method itself takes no lock; GetMatches calls it while holding the read
// lock.
func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
	kindIndex := int(hashKind) - 1
	results := make([]Result, 0, 100) // hope that we don't need all of them
	for candidate, idList := range b.hashes[kindIndex] {
		dist := bits.OnesCount64(candidate ^ searchHash)
		if dist > maxDistance {
			// Too many differing bits: not a match.
			continue
		}
		results = append(results, Result{
			IDs:      ToIDList(*idList),
			Distance: dist,
			Hash:     Hash{Hash: candidate, Kind: hashKind},
		})
	}
	return results
}
// GetMatches looks up every hash in hashes and returns the stored entries that
// match.
//
// When exactOnly is true an exact lookup is tried first, and if it finds
// anything those results are returned immediately. If it finds nothing, or
// when exactOnly is false, every stored hash of the same kind is compared and
// all entries within max differing bits are returned. Each stored
// (hash value, kind) pair is reported at most once.
//
// An error is returned, before any search is done, when one of the hashes has
// a kind for which this storage has no map. The read lock is held for the
// whole search.
func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
	// Reject kinds that would index outside b.hashes instead of panicking.
	for _, hash := range hashes {
		if hashType := int(hash.Kind) - 1; hashType < 0 || hashType >= len(b.hashes) {
			return nil, fmt.Errorf("unsupported hash kind %d", int(hash.Kind))
		}
	}

	var foundMatches []Result
	b.hashMutex.RLock()
	defer b.hashMutex.RUnlock()
	resetTime()

	if exactOnly {
		for _, hash := range hashes {
			ids := b.hashes[int(hash.Kind)-1][hash.Hash]
			if ids != nil && len(*ids) > 0 {
				foundMatches = append(foundMatches, Result{
					Distance: 0,
					Hash:     hash,
					IDs:      ToIDList(*ids),
				})
			}
		}
		// If we have exact matches don't bother with other matches
		if len(foundMatches) > 0 {
			return foundMatches, nil
		}
		logTime("Search Exact")
	}

	// De-duplicate on the full Hash (value and kind): two different kinds may
	// legitimately share the same 64-bit value and must both be reported.
	foundHashes := make(map[Hash]struct{})
	totalPartialHashes := 0
	for _, hash := range hashes {
		// Atleast compares against every stored hash of this kind.
		totalPartialHashes += len(b.hashes[int(hash.Kind)-1])
		for _, match := range b.Atleast(hash.Kind, max, hash.Hash) {
			if _, alreadyMatched := foundHashes[match.Hash]; alreadyMatched {
				continue
			}
			foundHashes[match.Hash] = struct{}{}
			foundMatches = append(foundMatches, match)
		}
	}
	fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes))
	logTime("Search Complete")
	// Runs in its own goroutine, so possibly after the read lock is released.
	go b.printSizes()
	return foundMatches, nil
}
// MapHashes records hash.ID under every hash value listed in hash.Hashes.
//
// A hash value that has not been stored before gets a fresh ID list; the
// previous code dereferenced the missing (nil) map entry and panicked.
//
// NOTE(review): this method does not take hashMutex — confirm that callers
// serialize it against GetMatches.
func (b *basicMapStorage) MapHashes(hash ImageHash) {
	for _, ih := range hash.Hashes {
		hashType := int(ih.Kind) - 1
		if b.hashes[hashType] == nil {
			// Zero-value storage: writing to a nil map would panic.
			b.hashes[hashType] = make(map[uint64]*[]ID)
		}
		ids := b.hashes[hashType][ih.Hash]
		if ids == nil {
			ids = new([]ID)
			b.hashes[hashType][ih.Hash] = ids
		}
		*ids = InsertID(*ids, hash.ID)
	}
}
// DecodeHashes loads previously saved hashes into the storage.
//
// For every hash type present in hashes.Hashes the stored map is replaced by
// one that points each saved hash at its ID list inside hashes.IDs. The ids
// index is rebuilt from the same lists, so GetIDs, AssociateIDs and
// EncodeHashes see the loaded data and share the list pointers with the hash
// maps.
//
// An error is returned, and the storage is left untouched, when the saved data
// contains a hash type this storage has no slot for or references an ID list
// that does not exist.
//
// NOTE(review): this method does not take hashMutex — confirm that it is only
// called before the storage is shared.
func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error {
	// Work on a copy so a failure midway does not leave a half-loaded storage.
	// Slots not present in the saved data keep their current map.
	decoded := b.hashes
	for hashType, sourceHashes := range hashes.Hashes {
		if hashType >= len(decoded) {
			return fmt.Errorf("saved hashes contain unsupported hash type index %d", hashType)
		}
		byHash := make(map[uint64]*[]ID, len(sourceHashes))
		for savedHash, idlistLocation := range sourceHashes {
			if idlistLocation < 0 || idlistLocation >= len(hashes.IDs) {
				return fmt.Errorf("saved hash %d references ID list %d but only %d lists exist", savedHash, idlistLocation, len(hashes.IDs))
			}
			byHash[savedHash] = &hashes.IDs[idlistLocation]
		}
		decoded[hashType] = byHash
	}

	// Every ID maps to the list that contains it, using the same pointers as
	// the hash maps above.
	ids := make(map[ID]*[]ID, len(hashes.IDs))
	for i := range hashes.IDs {
		idList := &hashes.IDs[i]
		for _, id := range *idList {
			ids[id] = idList
		}
	}

	b.hashes = decoded
	b.ids = ids
	b.printSizes()
	return nil
}
// printSizes is a debugging hook meant to report the memory used by the
// storage. The reporting statements below are commented out, so the method
// currently does nothing.
func (b *basicMapStorage) printSizes() {
// fmt.Println("Size of", "hashes:", size.Of(b.hashes)/1024/1024, "MB")
// fmt.Println("Size of", "ids:", size.Of(b.ids)/1024/1024, "MB")
// fmt.Println("Size of", "basicMapStorage:", size.Of(b)/1024/1024, "MB")
}
// EncodeHashes converts the storage into its saved form.
//
// Every distinct ID list is appended to the result's IDs exactly once, and
// each stored hash is written as the index of its list within IDs, which is
// the form DecodeHashes reads back. Lists are identified by pointer, so hashes
// that share a list share an index. The returned error is always nil.
//
// NOTE(review): assumes SavedHashes.Hashes is a fixed-size array of maps with
// the same length as b.hashes — confirm against its declaration.
func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) {
	hashes := SavedHashes{}
	idmap := map[*[]ID]int{}

	// indexOf returns the position of ids in hashes.IDs, appending the list
	// the first time it is seen.
	indexOf := func(ids *[]ID) int {
		if idx, ok := idmap[ids]; ok {
			return idx
		}
		hashes.IDs = append(hashes.IDs, *ids)
		idx := len(hashes.IDs) - 1 // index of the element just appended
		idmap[ids] = idx
		return idx
	}

	for _, ids := range b.ids {
		indexOf(ids)
	}
	for hashType, hashToID := range b.hashes {
		// The zero-value SavedHashes has nil maps; allocate before writing.
		hashes.Hashes[hashType] = make(map[uint64]int, len(hashToID))
		for hash, ids := range hashToID {
			// Lists that are not reachable through b.ids are added here
			// instead of silently mapping to index 0.
			hashes.Hashes[hashType][hash] = indexOf(ids)
		}
	}
	return hashes, nil
}
// AssociateIDs adds each NewID to the ID list that its OldID belongs to.
//
// It panics when an OldID is not known to this storage; pairs processed before
// the failing one stay applied.
//
// NOTE(review): this method does not take hashMutex — confirm that callers
// serialize it against readers.
func (b *basicMapStorage) AssociateIDs(newids []NewIDs) {
	for _, newid := range newids {
		ids, found := b.ids[newid.OldID]
		if !found {
			// The message previously lacked the space before "exist".
			msg := "No IDs belonging to " + newid.OldID.Domain + " exist on this server"
			panic(msg)
		}
		*ids = InsertID(*ids, newid.NewID)
	}
}
// GetIDs returns the list of IDs stored for id, converted with ToIDList.
//
// It panics when id is not known to this storage.
func (b *basicMapStorage) GetIDs(id ID) IDList {
	ids, found := b.ids[id]
	if !found {
		// The message previously lacked the space before "exist".
		msg := "No IDs belonging to " + id.Domain + " exist on this server"
		panic(msg)
	}
	return ToIDList(*ids)
}
// NewBasicMapStorage returns an empty basicMapStorage with all of its maps
// allocated. The returned error is always nil.
func NewBasicMapStorage() (HashStorage, error) {
	storage := &basicMapStorage{
		// Allocate ids as well: lookups on a nil map work, but any write to
		// it would panic.
		ids: make(map[ID]*[]ID),
	}
	// hashMutex is usable as its zero value and needs no initialization.
	for i := range storage.hashes {
		storage.hashes[i] = make(map[uint64]*[]ID)
	}
	return storage, nil
}