Compare commits


1 Commit

Author  SHA1        Message                              Date
        22d59aa221  Move HashStorage to its own package  2025-05-31 19:00:40 -07:00
10 changed files with 266 additions and 245 deletions

View File

@@ -35,6 +35,7 @@ import (
 	ch "gitea.narnian.us/lordwelch/comic-hasher"
 	"gitea.narnian.us/lordwelch/comic-hasher/cv"
+	"gitea.narnian.us/lordwelch/comic-hasher/storage"
 )
 var bufPool = &sync.Pool{
@@ -215,15 +216,15 @@ func signalHandler(s *Server) {
 func initializeStorage(opts Opts) (ch.HashStorage, error) {
 	switch opts.storageType {
 	case Map:
-		return ch.NewMapStorage()
+		return storage.NewMapStorage()
 	case BasicMap:
-		return ch.NewBasicMapStorage()
+		return storage.NewBasicMapStorage()
 	case Sqlite:
-		return ch.NewSqliteStorage("sqlite", opts.sqlitePath)
+		return storage.NewSqliteStorage("sqlite", opts.sqlitePath)
 	case Sqlite3:
-		return ch.NewSqliteStorage("sqlite3", opts.sqlitePath)
+		return storage.NewSqliteStorage("sqlite3", opts.sqlitePath)
 	case VPTree:
-		return ch.NewVPStorage()
+		return storage.NewVPStorage()
 	}
 	return nil, errors.New("Unknown storage type provided")
 }
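
The only change in this file is that the storage constructors now come from the new storage package instead of the root ch package. Below is a hedged sketch of how a caller outside this command might wire up a backend after the move; the kind strings and the fallback choice are illustrative assumptions, not part of the diff.

package main

import (
	"log"

	ch "gitea.narnian.us/lordwelch/comic-hasher"
	"gitea.narnian.us/lordwelch/comic-hasher/storage"
)

// newStorage picks a backend by name; the names are assumptions for this sketch.
func newStorage(kind, sqlitePath string) (ch.HashStorage, error) {
	switch kind {
	case "map":
		return storage.NewMapStorage()
	case "basicmap":
		return storage.NewBasicMapStorage()
	case "sqlite":
		return storage.NewSqliteStorage("sqlite", sqlitePath)
	case "vptree":
		return storage.NewVPStorage()
	}
	return storage.NewMapStorage()
}

func main() {
	s, err := newStorage("map", "")
	if err != nil {
		log.Fatal(err)
	}
	_ = s // ready for MapHashes / GetMatches calls
}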

View File

@@ -83,6 +83,19 @@ func (f *Format) Set(s string) error {
 	return nil
 }
+
+func (h *SavedHash) Clone() SavedHash {
+	return SavedHash{
+		Hash: Hash{
+			Hash: h.Hash.Hash,
+			Kind: h.Hash.Kind,
+		},
+		ID: ID{
+			Domain: NewSource(*h.ID.Domain),
+			ID:     strings.Clone(h.ID.ID),
+		},
+	}
+}
 func (s *SavedHashes) InsertHash(hash SavedHash) {
 	index, itemFound := slices.BinarySearchFunc(s.Hashes, hash, func(existing SavedHash, target SavedHash) int {
 		return cmp.Or(
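
Clone is what lets basicmap.go (the next file) drop its private bmHash copy type: strings.Clone copies the ID string and NewSource builds a fresh Source, so a retained SavedHash no longer pins the large buffers backing the decoded JSON. A minimal usage sketch, assuming hashes is a decoded *SavedHashes as used elsewhere in this diff:

// keep only small, detached copies of the hashes we intend to retain
kept := make([]SavedHash, 0, len(hashes.Hashes))
for i := range hashes.Hashes {
	// Clone copies Hash by value and makes fresh copies of the Domain and ID
	// strings, so kept does not reference the decoded JSON document's memory.
	kept = append(kept, hashes.Hashes[i].Clone())
}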

View File

@@ -1,4 +1,4 @@
-package ch
+package storage
 import (
 	"cmp"
@@ -6,49 +6,34 @@ import (
 	"fmt"
 	"math/bits"
 	"slices"
-	"strings"
 	"sync"
+	ch "gitea.narnian.us/lordwelch/comic-hasher"
 	"gitea.narnian.us/lordwelch/goimagehash"
 )
-type bmHash struct {
-	Hash Hash
-	ID   ID
-}
-func NewbmHash(data SavedHash) bmHash {
-	return bmHash{
-		Hash: Hash{
-			Hash: data.Hash.Hash,
-			Kind: data.Hash.Kind,
-		},
-		ID: ID{
-			Domain: data.ID.Domain,
-			ID:     strings.Clone(data.ID.ID),
-		},
-	}
-}
 type basicMapStorage struct {
 	hashMutex *sync.RWMutex
 	ids     IDMap
-	aHashes []bmHash
-	dHashes []bmHash
-	pHashes []bmHash
+	aHashes []ch.SavedHash
+	dHashes []ch.SavedHash
+	pHashes []ch.SavedHash
 }
 type IDs struct {
-	id     *ID
-	idList *[]*ID
+	id     *ch.ID
+	idList *[]*ch.ID
 }
 type IDMap struct {
 	ids []IDs
 }
-func (m *IDMap) InsertID(id *ID) *ID {
-	return m.insertID(id, &[]*ID{id})
+func (m *IDMap) InsertID(id *ch.ID) *ch.ID {
+	return m.insertID(id, &[]*ch.ID{id})
 }
-func (m *IDMap) insertID(id *ID, idList *[]*ID) *ID {
-	index, found := slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ID) int {
+func (m *IDMap) insertID(id *ch.ID, idList *[]*ch.ID) *ch.ID {
+	index, found := slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ch.ID) int {
 		return id.id.Compare(*target)
 	})
 	if !found {
@@ -66,40 +51,40 @@ func (m *IDMap) sort() {
 	})
 }
-func (m *IDMap) FindID(id *ID) (int, bool) {
-	return slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ID) int {
+func (m *IDMap) FindID(id *ch.ID) (int, bool) {
+	return slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ch.ID) int {
 		return id.id.Compare(*target)
 	})
 }
-func (m *IDMap) GetIDs(id *ID) []ID {
+func (m *IDMap) GetIDs(id *ch.ID) []ch.ID {
 	index, found := m.FindID(id)
 	if !found {
 		return nil
 	}
-	ids := make([]ID, 0, len(*m.ids[index].idList))
+	ids := make([]ch.ID, 0, len(*m.ids[index].idList))
 	for _, id := range *m.ids[index].idList {
 		ids = append(ids, *id)
 	}
 	return ids
 }
-func (m *IDMap) AssociateIDs(newids []NewIDs) error {
+func (m *IDMap) AssociateIDs(newids []ch.NewIDs) error {
 	for _, newid := range newids {
 		index, found := m.FindID(&newid.OldID)
 		if !found {
 			return ErrIDNotFound
 		}
-		*(m.ids[index].idList) = InsertIDp(*(m.ids[index].idList), &newid.NewID)
+		*(m.ids[index].idList) = ch.InsertIDp(*(m.ids[index].idList), &newid.NewID)
 		m.insertID(&newid.NewID, m.ids[index].idList)
 	}
 	return nil
 }
-// func (m *IDMap) NewID(domain Source, id string) *ID {
-// 	newID := ID{domain, id}
-// 	index, found := slices.BinarySearchFunc(m.idList, newID, func(id *ID, target ID) int {
+// func (m *IDMap) NewID(domain Source, id string) *ch.ID {
+// 	newID := ch.ID{domain, id}
+// 	index, found := slices.BinarySearchFunc(m.idList, newID, func(id *ch.ID, target ch.ID) int {
// 		return id.Compare(*target)
// 	})
// 	if !found {
@@ -111,11 +96,11 @@ func (m *IDMap) AssociateIDs(newids []NewIDs) error {
 var ErrIDNotFound = errors.New("ID not found on this server")
 // atleast must have a read lock before using
-func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
-	matchingHashes := make([]Result, 0, 20) // hope that we don't need more
+func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, searchHash uint64) []ch.Result {
+	matchingHashes := make([]ch.Result, 0, 20) // hope that we don't need more
 	mappedIds := map[int]bool{}
-	storedHash := bmHash{} // reduces allocations and ensures queries are <1s
+	storedHash := ch.SavedHash{} // reduces allocations and ensures queries are <1s
 	for _, storedHash = range *b.getCurrentHashes(kind) {
 		distance := bits.OnesCount64(searchHash ^ storedHash.Hash.Hash)
 		if distance <= maxDistance {
@@ -124,7 +109,7 @@ func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, search
 				continue
 			}
 			mappedIds[index] = true
-			matchingHashes = append(matchingHashes, Result{
+			matchingHashes = append(matchingHashes, ch.Result{
 				Hash:     storedHash.Hash,
 				ID:       storedHash.ID,
 				Distance: distance,
@@ -135,8 +120,8 @@ func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, search
 	return matchingHashes
 }
-func (b *basicMapStorage) exactMatches(hashes []Hash, max int) []Result {
-	var foundMatches []Result
+func (b *basicMapStorage) exactMatches(hashes []ch.Hash, max int) []ch.Result {
+	var foundMatches []ch.Result
 	for _, hash := range hashes {
 		mappedIds := map[int]bool{}
@@ -149,7 +134,7 @@ func (b *basicMapStorage) exactMatches(hashes []Hash, max int) []Result {
 			}
 			mappedIds[index] = true
-			foundMatches = append(foundMatches, Result{
+			foundMatches = append(foundMatches, ch.Result{
 				Hash:     storedHash.Hash,
 				ID:       storedHash.ID,
 				Distance: 0,
@@ -162,20 +147,20 @@ func (b *basicMapStorage) exactMatches(hashes []Hash, max int) []Result {
 	return foundMatches
 }
-func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
+func (b *basicMapStorage) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
 	var (
-		foundMatches []Result
-		tl           timeLog
+		foundMatches []ch.Result
+		tl           ch.TimeLog
 	)
-	tl.resetTime()
-	defer tl.logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
+	tl.ResetTime()
+	defer tl.LogTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
 	b.hashMutex.RLock()
 	defer b.hashMutex.RUnlock()
 	if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
 		foundMatches = b.exactMatches(hashes, max)
-		tl.logTime("Search Exact")
+		tl.LogTime("Search Exact")
 		if len(foundMatches) > 0 {
 			return foundMatches, nil
 		}
@@ -193,7 +178,7 @@ func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]
 }
 // getCurrentHashes must have a read lock before using
-func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]bmHash {
+func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]ch.SavedHash {
 	if kind == goimagehash.AHash {
 		return &b.aHashes
 	}
@@ -209,9 +194,9 @@ func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]bmHash {
 // findHash must have a read lock before using
 // return value is index, count
 // if count < 1 then no results were found
-func (b *basicMapStorage) findHash(hash Hash) (int, int) {
+func (b *basicMapStorage) findHash(hash ch.Hash) (int, int) {
 	currentHashes := *b.getCurrentHashes(hash.Kind)
-	index, found := slices.BinarySearchFunc(currentHashes, hash, func(existing bmHash, target Hash) int {
+	index, found := slices.BinarySearchFunc(currentHashes, hash, func(existing ch.SavedHash, target ch.Hash) int {
 		return cmp.Compare(existing.Hash.Hash, target.Hash)
 	})
 	if !found {
@@ -225,7 +210,7 @@ func (b *basicMapStorage) findHash(hash Hash) (int, int) {
 }
 // insertHash must already have a lock
-func (b *basicMapStorage) insertHash(hash Hash, id ID) {
+func (b *basicMapStorage) insertHash(hash ch.Hash, id ch.ID) {
 	currentHashes := b.getCurrentHashes(hash.Kind)
 	index, count := b.findHash(hash)
 	max := index + count
@@ -235,12 +220,15 @@ func (b *basicMapStorage) insertHash(hash Hash, id ID) {
 		}
 	}
-	sh := bmHash{hash, id}
+	sh := ch.SavedHash{
+		Hash: hash,
+		ID:   id,
+	}
 	*currentHashes = slices.Insert(*currentHashes, index, sh)
 	b.ids.InsertID(&sh.ID)
 }
-func (b *basicMapStorage) MapHashes(hash ImageHash) {
+func (b *basicMapStorage) MapHashes(hash ch.ImageHash) {
 	b.hashMutex.Lock()
 	defer b.hashMutex.Unlock()
 	for _, ih := range hash.Hashes {
@@ -249,7 +237,7 @@ func (b *basicMapStorage) MapHashes(hash ImageHash) {
 }
 // DecodeHashes must already have a lock
-func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
+func (b *basicMapStorage) DecodeHashes(hashes *ch.SavedHashes) error {
 	if hashes == nil {
 		return nil
 	}
@@ -257,7 +245,7 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
 	// Initialize all the known equal IDs
 	for _, ids := range hashes.IDs {
-		new_ids := make([]*ID, 0, len(ids))
+		new_ids := make([]*ch.ID, 0, len(ids))
 		for _, id := range ids {
 			new_ids = append(new_ids, &id)
 		}
@@ -270,7 +258,7 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
 	}
 	b.ids.sort()
-	slices.SortFunc(hashes.Hashes, func(existing, target SavedHash) int {
+	slices.SortFunc(hashes.Hashes, func(existing, target ch.SavedHash) int {
 		return cmp.Or(
 			cmp.Compare(*existing.ID.Domain, *target.ID.Domain), // Sorted for id insertion efficiency
 			cmp.Compare(existing.ID.ID, target.ID.ID),           // Sorted for id insertion efficiency
@@ -295,31 +283,31 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
 	}
 	// Assume they are probably fairly equally split between hash types
-	b.aHashes = make([]bmHash, 0, aHashCount)
-	b.dHashes = make([]bmHash, 0, dHashCount)
-	b.pHashes = make([]bmHash, 0, pHashCount)
+	b.aHashes = make([]ch.SavedHash, 0, aHashCount)
+	b.dHashes = make([]ch.SavedHash, 0, dHashCount)
+	b.pHashes = make([]ch.SavedHash, 0, pHashCount)
 	for i := range hashes.Hashes {
-		bmhash := NewbmHash(hashes.Hashes[i])
+		hash := hashes.Hashes[i].Clone() // Not cloning this will keep strings/slices loaded from json wasting memory
 		if hashes.Hashes[i].Hash.Kind == goimagehash.AHash {
-			b.aHashes = append(b.aHashes, bmhash)
+			b.aHashes = append(b.aHashes, hash)
 		}
 		if hashes.Hashes[i].Hash.Kind == goimagehash.DHash {
-			b.dHashes = append(b.dHashes, bmhash)
+			b.dHashes = append(b.dHashes, hash)
 		}
 		if hashes.Hashes[i].Hash.Kind == goimagehash.PHash {
-			b.pHashes = append(b.pHashes, bmhash)
+			b.pHashes = append(b.pHashes, hash)
 		}
-		if hashes.Hashes[i].ID == (ID{}) {
+		if hashes.Hashes[i].ID == (ch.ID{}) {
 			fmt.Println("Empty ID detected")
 			panic(hashes.Hashes[i])
 		}
 		// TODO: Make loading this more efficient
 		// All known equal IDs are already mapped we can add any missing ones from hashes
-		b.ids.InsertID(&bmhash.ID)
+		b.ids.InsertID(&hash.ID)
 	}
-	hashCmp := func(existing, target bmHash) int {
+	hashCmp := func(existing, target ch.SavedHash) int {
 		return cmp.Or(
 			cmp.Compare(existing.Hash.Hash, target.Hash.Hash),
 			cmp.Compare(*existing.ID.Domain, *target.ID.Domain),
@@ -334,9 +322,9 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
 }
 // EncodeHashes should already have a lock
-func (b *basicMapStorage) EncodeHashes() (*SavedHashes, error) {
-	savedHashes := SavedHashes{
-		Hashes: make([]SavedHash, 0, len(b.aHashes)+len(b.dHashes)+len(b.pHashes)),
+func (b *basicMapStorage) EncodeHashes() (*ch.SavedHashes, error) {
+	savedHashes := ch.SavedHashes{
+		Hashes: make([]ch.SavedHash, 0, len(b.aHashes)+len(b.dHashes)+len(b.pHashes)),
 	}
 	// savedHashes.Hashes = append(savedHashes.Hashes, b.aHashes...)
 	// savedHashes.Hashes = append(savedHashes.Hashes, b.dHashes...)
@@ -357,28 +345,28 @@ func (b *basicMapStorage) EncodeHashes() (*SavedHashes, error) {
 	return &savedHashes, nil
 }
-func (b *basicMapStorage) AssociateIDs(newids []NewIDs) error {
+func (b *basicMapStorage) AssociateIDs(newids []ch.NewIDs) error {
 	b.hashMutex.RLock()
 	defer b.hashMutex.RUnlock()
 	return b.ids.AssociateIDs(newids)
 }
-func (b *basicMapStorage) GetIDs(id ID) IDList {
+func (b *basicMapStorage) GetIDs(id ch.ID) ch.IDList {
 	b.hashMutex.RLock()
 	defer b.hashMutex.RUnlock()
 	ids := b.ids.GetIDs(&id)
-	return ToIDList(ids)
+	return ch.ToIDList(ids)
 }
-func NewBasicMapStorage() (HashStorage, error) {
+func NewBasicMapStorage() (ch.HashStorage, error) {
 	storage := &basicMapStorage{
 		hashMutex: &sync.RWMutex{},
 		ids: IDMap{
 			ids: []IDs{},
 		},
-		aHashes: []bmHash{},
-		dHashes: []bmHash{},
-		pHashes: []bmHash{},
+		aHashes: []ch.SavedHash{},
+		dHashes: []ch.SavedHash{},
+		pHashes: []ch.SavedHash{},
 	}
 	return storage, nil
 }
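
Most of this file is a mechanical rename from the package-local bmHash to ch.SavedHash, but the IDMap logic it keeps relies on one pattern throughout: binary-search a sorted slice, then insert only if the key is missing. A self-contained sketch of that pattern with plain strings (the real code compares *ch.ID values with Compare):

package main

import (
	"cmp"
	"fmt"
	"slices"
)

// insertSorted mirrors the shape of IDMap.insertID: find the position with a
// binary search and insert only when the value is not already present.
func insertSorted(ids []string, id string) []string {
	index, found := slices.BinarySearchFunc(ids, id, cmp.Compare)
	if !found {
		ids = slices.Insert(ids, index, id)
	}
	return ids
}

func main() {
	ids := []string{"comicvine.gamespot.com/1", "comicvine.gamespot.com/3"}
	ids = insertSorted(ids, "comicvine.gamespot.com/2")
	fmt.Println(ids)
}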

View File

@@ -1,10 +1,11 @@
-package ch
+package storage
 import (
 	"fmt"
 	"slices"
 	"sync"
+	ch "gitea.narnian.us/lordwelch/comic-hasher"
 	"gitea.narnian.us/lordwelch/goimagehash"
 )
@@ -15,10 +16,10 @@ type MapStorage struct {
 	partialPHash [8]map[uint8][]uint64
 }
-func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
+func (m *MapStorage) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
 	var (
-		foundMatches []Result
-		tl           timeLog
+		foundMatches []ch.Result
+		tl           ch.TimeLog
 	)
 	m.hashMutex.RLock()
 	defer m.hashMutex.RUnlock()
@@ -26,13 +27,13 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
 	if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
 		foundMatches = m.exactMatches(hashes, max)
-		tl.logTime("Search Exact")
+		tl.LogTime("Search Exact")
 		if len(foundMatches) > 0 {
 			return foundMatches, nil
 		}
 	}
-	tl.resetTime()
-	defer tl.logTime("Search Complete")
+	tl.ResetTime()
+	defer tl.LogTime("Search Complete")
 	totalPartialHashes := 0
@@ -40,15 +41,18 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
 		currentHashes, currentPartialHashes := m.getCurrentHashes(searchHash.Kind)
 		potentialMatches := []uint64{}
-		for i, partialHash := range SplitHash(searchHash.Hash) {
+		for i, partialHash := range ch.SplitHash(searchHash.Hash) {
 			potentialMatches = append(potentialMatches, currentPartialHashes[i][partialHash]...)
 		}
 		totalPartialHashes += len(potentialMatches)
 		mappedIds := map[int]bool{}
-		for _, match := range Atleast(max, searchHash.Hash, potentialMatches) {
-			matchedHash := Hash{match.Hash, searchHash.Kind}
+		for _, match := range ch.Atleast(max, searchHash.Hash, potentialMatches) {
+			matchedHash := ch.Hash{
+				Hash: match.Hash,
+				Kind: searchHash.Kind,
+			}
 			index, count := m.findHash(matchedHash)
 			if count < 1 {
 				continue
@@ -60,7 +64,7 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
 			}
 			mappedIds[idIndex] = true
-			foundMatches = append(foundMatches, Result{
+			foundMatches = append(foundMatches, ch.Result{
 				Hash:     storedHash.Hash,
 				ID:       storedHash.ID,
 				Distance: 0,
@@ -75,7 +79,7 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
 }
 // getCurrentHashes must have a read lock before using
-func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]bmHash, [8]map[uint8][]uint64) {
+func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]ch.SavedHash, [8]map[uint8][]uint64) {
 	if kind == goimagehash.AHash {
 		return m.aHashes, m.partialAHash
 	}
@@ -88,17 +92,17 @@ func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]bmHash, [8]map[uint8][]uint64) {
 	panic("Unknown hash type: " + kind.String())
 }
-func (m *MapStorage) MapHashes(hash ImageHash) {
+func (m *MapStorage) MapHashes(hash ch.ImageHash) {
 	m.basicMapStorage.MapHashes(hash)
 	for _, hash := range hash.Hashes {
 		_, partialHashes := m.getCurrentHashes(hash.Kind)
-		for i, partialHash := range SplitHash(hash.Hash) {
-			partialHashes[i][partialHash] = Insert(partialHashes[i][partialHash], hash.Hash)
+		for i, partialHash := range ch.SplitHash(hash.Hash) {
+			partialHashes[i][partialHash] = ch.Insert(partialHashes[i][partialHash], hash.Hash)
 		}
 	}
 }
-func (m *MapStorage) DecodeHashes(hashes *SavedHashes) error {
+func (m *MapStorage) DecodeHashes(hashes *ch.SavedHashes) error {
 	if hashes == nil {
 		return nil
 	}
@@ -117,7 +121,7 @@ func (m *MapStorage) DecodeHashes(hashes *SavedHashes) error {
 	return nil
 }
-func NewMapStorage() (HashStorage, error) {
+func NewMapStorage() (ch.HashStorage, error) {
 	storage := &MapStorage{
 		basicMapStorage: basicMapStorage{
@@ -125,9 +129,9 @@ func NewMapStorage() (HashStorage, error) {
 			ids: IDMap{
 				ids: []IDs{},
 			},
-			aHashes: []bmHash{},
-			dHashes: []bmHash{},
-			pHashes: []bmHash{},
+			aHashes: []ch.SavedHash{},
+			dHashes: []ch.SavedHash{},
+			pHashes: []ch.SavedHash{},
 		},
 		partialAHash: newPartialHash(),
 		partialDHash: newPartialHash(),
@@ -149,9 +153,9 @@ func newPartialHash() [8]map[uint8][]uint64 {
 	}
 }
-func mapPartialHashes(hashes []bmHash, partialHashMap [8]map[uint8][]uint64) {
+func mapPartialHashes(hashes []ch.SavedHash, partialHashMap [8]map[uint8][]uint64) {
 	for _, savedHash := range hashes {
-		for i, partialHash := range SplitHash(savedHash.Hash.Hash) {
+		for i, partialHash := range ch.SplitHash(savedHash.Hash.Hash) {
 			partialHashMap[i][partialHash] = append(partialHashMap[i][partialHash], savedHash.Hash.Hash)
 		}
 	}
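
MapStorage narrows the search space before ranking: ch.SplitHash (not shown in this diff) breaks a 64-bit hash into parts that index the partial*Hash buckets, and only the bucketed candidates are handed to ch.Atleast. A hedged, self-contained sketch of the idea, assuming a byte-wise split; by pigeonhole, any hash within Hamming distance 7 of the query agrees with it on at least one of the 8 bytes, so the union of those buckets contains every close match:

package main

import (
	"fmt"
	"math/bits"
)

// splitHash is an assumed byte-wise split, used only to illustrate the bucketing.
func splitHash(h uint64) [8]uint8 {
	var parts [8]uint8
	for i := range parts {
		parts[i] = uint8(h >> (8 * i))
	}
	return parts
}

func main() {
	query := uint64(0xDEADBEEF00C0FFEE)
	stored := query ^ (1 << 5) // one bit flipped: Hamming distance 1
	shared := 0
	for i, p := range splitHash(query) {
		if splitHash(stored)[i] == p {
			shared++ // stored would be found in each of these byte-indexed buckets
		}
	}
	fmt.Println(bits.OnesCount64(query^stored), "bit differs,", shared, "of 8 byte buckets still match")
}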

View File

@@ -1,4 +1,4 @@
-package ch
+package storage
 import (
 	"context"
@@ -8,6 +8,7 @@ import (
 	"log"
 	"math/bits"
+	ch "gitea.narnian.us/lordwelch/comic-hasher"
 	_ "modernc.org/sqlite"
 )
@@ -26,19 +27,19 @@ type sqliteStorage struct {
 	idExists *sql.Stmt
 }
-func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, hash Hash) (map[ID][]ID, error) {
+func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, hash ch.Hash) (map[ch.ID][]ch.ID, error) {
 	if statement == nil {
 		statement = s.hashExactMatchStatement
 	}
-	hashes := map[ID][]ID{}
+	hashes := map[ch.ID][]ch.ID{}
 	rows, err := statement.Query(hash.Kind, int64(hash.Hash))
 	if err != nil {
 		return hashes, err
 	}
 	for rows.Next() {
 		var (
-			id      ID
-			foundID ID
+			id      ch.ID
+			foundID ch.ID
 		)
 		err = rows.Scan(&foundID.Domain, &foundID.ID, &id.Domain, &id.ID)
 		if err != nil {
@@ -51,24 +52,24 @@ func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, hash Hash) (map[ID][]ID, error) {
 	return hashes, nil
 }
-func (s *sqliteStorage) findPartialHashes(tl timeLog, statement *sql.Stmt, max int, hash Hash) ([]Result, error) {
+func (s *sqliteStorage) findPartialHashes(tl ch.TimeLog, statement *sql.Stmt, max int, hash ch.Hash) ([]ch.Result, error) {
 	if statement == nil {
 		statement = s.hashPartialMatchStatement
 	}
-	hashResults := []Result{}
+	hashResults := []ch.Result{}
 	rows, err := statement.Query(hash.Kind, int64(hash.Hash))
 	if err != nil {
 		return hashResults, err
 	}
-	results := map[SavedHash][]ID{}
+	results := map[ch.SavedHash][]ch.ID{}
 	for rows.Next() {
 		var (
 			tmpHash int64
-			sqlHash = SavedHash{
-				Hash: Hash{Kind: hash.Kind},
+			sqlHash = ch.SavedHash{
+				Hash: ch.Hash{Kind: hash.Kind},
 			}
-			id ID
+			id ch.ID
 		)
 		err = rows.Scan(&sqlHash.ID.Domain, &sqlHash.ID.ID, &tmpHash, &id.Domain, &id.ID)
 		if err != nil {
@@ -79,7 +80,7 @@ func (s *sqliteStorage) findPartialHashes(tl timeLog, statement *sql.Stmt, max i
 		results[sqlHash] = append(results[sqlHash], id)
 	}
 	for sqlHash, ids := range results {
-		res := Result{
+		res := ch.Result{
 			Hash:     sqlHash.Hash,
 			ID:       sqlHash.ID,
 			Distance: bits.OnesCount64(hash.Hash ^ sqlHash.Hash.Hash),
@@ -134,12 +135,12 @@ func (s *sqliteStorage) createIndexes() error {
 	return nil
 }
-func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
+func (s *sqliteStorage) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
 	var (
-		foundMatches []Result
-		tl           timeLog
+		foundMatches []ch.Result
+		tl           ch.TimeLog
 	)
-	tl.resetTime()
+	tl.ResetTime()
 	if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
 		for _, hash := range hashes {
@@ -148,7 +149,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
 				return foundMatches, err
 			}
 			for id, equivalentIDs := range idlist {
-				foundMatches = append(foundMatches, Result{
+				foundMatches = append(foundMatches, ch.Result{
 					Hash:     hash,
 					ID:       id,
 					Distance: 0,
@@ -157,7 +158,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
 			}
 		}
-		tl.logTime("Search Exact")
+		tl.LogTime("Search Exact")
 		if len(foundMatches) > 0 {
 			return foundMatches, nil
 		}
@@ -170,7 +171,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
 		if err != nil {
 			return foundMatches, err
 		}
-		tl.logTime(fmt.Sprintf("Search partial %v", hash.Kind))
+		tl.LogTime(fmt.Sprintf("Search partial %v", hash.Kind))
 		for _, hash := range results {
 			if _, alreadyMatched := foundHashes[hash.Hash.Hash]; !alreadyMatched {
@@ -185,7 +186,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
 	return foundMatches, nil
 }
-func (s *sqliteStorage) mapHashes(tx *sql.Tx, hash ImageHash) {
+func (s *sqliteStorage) mapHashes(tx *sql.Tx, hash ch.ImageHash) {
 	var err error
 	insertHash := tx.Stmt(s.insertHash)
 	insertID := tx.Stmt(s.insertID)
@@ -234,7 +235,7 @@ func (s *sqliteStorage) mapHashes(tx *sql.Tx, hash ImageHash) {
 		}
 	}
 }
-func (s *sqliteStorage) MapHashes(hash ImageHash) {
+func (s *sqliteStorage) MapHashes(hash ch.ImageHash) {
 	tx, err := s.db.BeginTx(context.Background(), nil)
 	if err != nil {
 		panic(err)
@@ -246,7 +247,7 @@ func (s *sqliteStorage) MapHashes(hash ImageHash) {
 	}
 }
-func (s *sqliteStorage) DecodeHashes(hashes *SavedHashes) error {
+func (s *sqliteStorage) DecodeHashes(hashes *ch.SavedHashes) error {
 	return nil
 	err := s.dropIndexes()
 	if err != nil {
@@ -285,8 +286,8 @@ func (s *sqliteStorage) DecodeHashes(hashes *SavedHashes) error {
 	}
 	for _, savedHash := range hashes.Hashes {
-		s.mapHashes(tx, ImageHash{
-			Hashes: []Hash{savedHash.Hash},
+		s.mapHashes(tx, ch.ImageHash{
+			Hashes: []ch.Hash{savedHash.Hash},
 			ID:     savedHash.ID,
 		})
 	}
@@ -302,8 +303,8 @@ func (s *sqliteStorage) DecodeHashes(hashes *SavedHashes) error {
 	return nil
 }
-func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
-	hashes := SavedHashes{}
+func (s *sqliteStorage) EncodeHashes() (*ch.SavedHashes, error) {
+	hashes := ch.SavedHashes{}
 	tx, err := s.db.Begin()
 	if err != nil {
 		return &hashes, err
@@ -315,7 +316,7 @@ func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
 	}
 	for rows.Next() {
 		var (
-			hash    SavedHash
+			hash    ch.SavedHash
 			tmpHash int64
 		)
 		err = rows.Scan(&hash.Hash.Kind, &tmpHash, &hash.ID.Domain, &hash.ID.ID)
@@ -331,11 +332,11 @@ func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
 	}
 	var (
 		previousEid int64 = -1
-		ids         []ID
+		ids         []ch.ID
 	)
 	for rows.Next() {
 		var (
-			id     ID
+			id     ch.ID
 			newEid int64
 		)
 		err = rows.Scan(&newEid, &id.Domain, &id.Domain)
@@ -348,14 +349,14 @@ func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
 			if len(ids) > 1 {
 				hashes.IDs = append(hashes.IDs, ids)
 			}
-			ids = make([]ID, 0)
+			ids = make([]ch.ID, 0)
 		}
 		ids = append(ids, id)
 	}
 	return &hashes, nil
 }
-func (s *sqliteStorage) AssociateIDs(newIDs []NewIDs) error {
+func (s *sqliteStorage) AssociateIDs(newIDs []ch.NewIDs) error {
 	tx, err := s.db.BeginTx(context.Background(), nil)
 	if err != nil {
 		panic(err)
@@ -397,21 +398,21 @@ func (s *sqliteStorage) AssociateIDs(newIDs []NewIDs) error {
 	return nil
 }
-func (s *sqliteStorage) GetIDs(id ID) IDList {
-	var ids []ID
+func (s *sqliteStorage) GetIDs(id ch.ID) ch.IDList {
+	var ids []ch.ID
 	rows, err := s.idMatchStatement.Query(id.Domain, id.ID)
 	if err != nil {
 		return nil
 	}
 	for rows.Next() {
-		var id ID
+		var id ch.ID
 		err = rows.Scan(&id.Domain, &id.ID)
 		if err != nil {
 			return nil
 		}
 		ids = append(ids, id)
 	}
-	return ToIDList(ids)
+	return ch.ToIDList(ids)
 }
 func (s *sqliteStorage) PrepareStatements() error {
@@ -480,7 +481,7 @@ func (s *sqliteStorage) PrepareStatements() error {
 	return nil
 }
-func NewSqliteStorage(db, path string) (HashStorage, error) {
+func NewSqliteStorage(db, path string) (ch.HashStorage, error) {
 	sqlite := &sqliteStorage{}
 	sqlDB, err := sql.Open(db, fmt.Sprintf("file://%s?_pragma=cache_size(-200000)&_pragma=busy_timeout(500)&_pragma=hard_heap_limit(1073741824)&_pragma=journal_mode(wal)&_pragma=soft_heap_limit(314572800)", path))
 	if err != nil {
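
After the move, the SQLite backend is constructed through storage.NewSqliteStorage; the first argument is the database/sql driver name ("sqlite" for the pure-Go modernc driver imported above, "sqlite3" for the driver selected by the cgo/non-cgo files below) and the second is the database path baked into the pragma-laden DSN. A hedged usage sketch; the path and hash values are placeholders:

package main

import (
	"log"

	ch "gitea.narnian.us/lordwelch/comic-hasher"
	"gitea.narnian.us/lordwelch/comic-hasher/storage"
	"gitea.narnian.us/lordwelch/goimagehash"
)

func main() {
	s, err := storage.NewSqliteStorage("sqlite", "/tmp/comic-hasher.sqlite")
	if err != nil {
		log.Fatal(err)
	}
	// GetMatches follows the signature shown in the diff; 8 and false are example arguments.
	results, err := s.GetMatches([]ch.Hash{{Hash: 0xDEADBEEF00C0FFEE, Kind: goimagehash.AHash}}, 8, false)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("%d matches", len(results))
}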

View File

@@ -1,6 +1,6 @@
 //go:build cgo && !gokrazy
-package ch
+package storage
 import (
 	_ "github.com/mattn/go-sqlite3"

View File

@@ -1,6 +1,6 @@
 //go:build !cgo && !gokrazy
-package ch
+package storage
 import (
 	_ "github.com/ncruces/go-sqlite3/driver"

View File

@@ -1,12 +1,13 @@
 //go:build !gokrazy
-package ch
+package storage
 import (
 	"errors"
 	"fmt"
 	"math/bits"
+	ch "gitea.narnian.us/lordwelch/comic-hasher"
 	"gitea.narnian.us/lordwelch/goimagehash"
 	"gonum.org/v1/gonum/spatial/vptree"
 )
@@ -15,14 +16,14 @@ type VPTree struct {
 	aTree *vptree.Tree
 	dTree *vptree.Tree
 	pTree *vptree.Tree
-	ids   map[ID]*[]ID
+	ids   map[ch.ID]*[]ch.ID
 	aHashes []vptree.Comparable // temporary, only used for vptree creation
 	dHashes []vptree.Comparable // temporary, only used for vptree creation
 	pHashes []vptree.Comparable // temporary, only used for vptree creation
 }
 type VPHash struct {
-	SavedHash
+	ch.SavedHash
 }
 func (h *VPHash) Distance(c vptree.Comparable) float64 {
@@ -33,22 +34,22 @@ func (h *VPHash) Distance(c vptree.Comparable) float64 {
 	return float64(bits.OnesCount64(h.Hash.Hash ^ h2.Hash.Hash))
 }
-func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
+func (v *VPTree) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
 	var (
-		matches      []Result
-		exactMatches []Result
-		tl           timeLog
+		matches      []ch.Result
+		exactMatches []ch.Result
+		tl           ch.TimeLog
 	)
-	tl.resetTime()
-	defer tl.logTime("Search Complete")
+	tl.ResetTime()
+	defer tl.LogTime("Search Complete")
 	for _, hash := range hashes {
 		results := vptree.NewDistKeeper(float64(max))
 		currentTree := v.getCurrentTree(hash.Kind)
-		currentTree.NearestSet(results, &VPHash{SavedHash{Hash: hash}})
-		mappedIds := map[*[]ID]bool{}
+		currentTree.NearestSet(results, &VPHash{ch.SavedHash{Hash: hash}})
+		mappedIds := map[*[]ch.ID]bool{}
 		for _, result := range results.Heap {
 			storedHash := result.Comparable.(*VPHash)
 			ids := v.ids[storedHash.ID]
@@ -57,14 +58,14 @@ func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, e
 			}
 			mappedIds[ids] = true
 			if result.Dist == 0 {
-				exactMatches = append(exactMatches, Result{
+				exactMatches = append(exactMatches, ch.Result{
 					Hash:          storedHash.Hash,
 					ID:            storedHash.ID,
 					Distance:      0,
 					EquivalentIDs: *v.ids[storedHash.ID],
 				})
 			} else {
-				matches = append(matches, Result{
+				matches = append(matches, ch.Result{
 					Hash:     storedHash.Hash,
 					ID:       storedHash.ID,
 					Distance: 0,
@@ -93,11 +94,11 @@ func (v *VPTree) getCurrentTree(kind goimagehash.Kind) *vptree.Tree {
 	panic("Unknown hash type: " + kind.String())
 }
-func (v *VPTree) MapHashes(ImageHash) {
+func (v *VPTree) MapHashes(ch.ImageHash) {
 	panic("Not Implemented")
 }
-func (v *VPTree) DecodeHashes(hashes *SavedHashes) error {
+func (v *VPTree) DecodeHashes(hashes *ch.SavedHashes) error {
 	if hashes == nil {
 		return nil
 	}
@@ -120,13 +121,13 @@ func (v *VPTree) DecodeHashes(hashes *SavedHashes) error {
 			v.pHashes = append(v.pHashes, &VPHash{savedHash})
 		}
-		if savedHash.ID == (ID{}) {
+		if savedHash.ID == (ch.ID{}) {
 			fmt.Println("Empty ID detected")
 			panic(savedHash)
 		}
 		// All known equal IDs are already mapped we can add any missing ones from hashes
 		if _, ok := v.ids[savedHash.ID]; !ok {
-			v.ids[savedHash.ID] = &[]ID{savedHash.ID}
+			v.ids[savedHash.ID] = &[]ch.ID{savedHash.ID}
 		}
 	}
@@ -144,23 +145,23 @@ func (v *VPTree) DecodeHashes(hashes *SavedHashes) error {
 	}
 	return nil
 }
-func (v *VPTree) EncodeHashes() (*SavedHashes, error) {
-	return &SavedHashes{}, errors.New("Not Implemented")
+func (v *VPTree) EncodeHashes() (*ch.SavedHashes, error) {
+	return &ch.SavedHashes{}, errors.New("Not Implemented")
 }
-func (v *VPTree) AssociateIDs(newIDs []NewIDs) error {
+func (v *VPTree) AssociateIDs(newIDs []ch.NewIDs) error {
 	return errors.New("Not Implemented")
 }
-func (v *VPTree) GetIDs(id ID) IDList {
+func (v *VPTree) GetIDs(id ch.ID) ch.IDList {
 	ids, found := v.ids[id]
 	if !found {
 		return nil
 	}
-	return ToIDList(*ids)
+	return ch.ToIDList(*ids)
 }
-func NewVPStorage() (HashStorage, error) {
+func NewVPStorage() (ch.HashStorage, error) {
 	var err error
 	v := &VPTree{
 		aHashes: []vptree.Comparable{},

View File

@@ -0,0 +1,13 @@
+//go:build gokrazy
+
+package storage
+
+import (
+	"errors"
+
+	ch "gitea.narnian.us/lordwelch/comic-hasher"
+)
+
+func NewVPStorage() (ch.HashStorage, error) {
+	return nil, errors.New("VPTree not available")
+}
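
This new file is the gokrazy counterpart to the !gokrazy vp-tree file above: the build tag swaps the gonum-backed implementation for a stub, so the package still compiles on gokrazy and callers find out at runtime. A hedged sketch of how a caller might handle the stub; the fallback to the map backend is an assumption, not something the diff prescribes:

package main

import (
	"log"

	ch "gitea.narnian.us/lordwelch/comic-hasher"
	"gitea.narnian.us/lordwelch/comic-hasher/storage"
)

// newTreeOrMap prefers the vantage-point tree but tolerates the gokrazy stub,
// which always returns an error instead of a usable backend.
func newTreeOrMap() (ch.HashStorage, error) {
	s, err := storage.NewVPStorage()
	if err == nil {
		return s, nil
	}
	log.Println("vptree unavailable, falling back to map storage:", err)
	return storage.NewMapStorage()
}

func main() {
	if _, err := newTreeOrMap(); err != nil {
		log.Fatal(err)
	}
}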

View File

@@ -5,17 +5,17 @@ import (
 	"time"
 )
-type timeLog struct {
+type TimeLog struct {
 	total time.Duration
 	last  time.Time
 }
-func (t *timeLog) resetTime() {
+func (t *TimeLog) ResetTime() {
 	t.total = 0
 	t.last = time.Now()
 }
-func (t *timeLog) logTime(log string) {
+func (t *TimeLog) LogTime(log string) {
 	now := time.Now()
 	diff := now.Sub(t.last)
 	t.last = now
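
Exporting timeLog as TimeLog is what lets the storage package keep the same timing calls (tl ch.TimeLog, tl.ResetTime(), tl.LogTime(...)) after the move. A minimal usage sketch matching the pattern used by the GetMatches implementations above:

package main

import (
	"time"

	ch "gitea.narnian.us/lordwelch/comic-hasher"
)

func main() {
	var tl ch.TimeLog
	tl.ResetTime()                      // start the clock
	defer tl.LogTime("Search Complete") // reported when the function returns

	time.Sleep(10 * time.Millisecond) // stand-in for the exact-match pass
	tl.LogTime("Search Exact")        // time since ResetTime or the previous LogTime
}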