Compare commits

..

1 Commits

Author SHA1 Message Date
22d59aa221 Move HashStorage to its own package 2025-05-31 19:00:40 -07:00
10 changed files with 266 additions and 245 deletions

View File

@ -35,6 +35,7 @@ import (
ch "gitea.narnian.us/lordwelch/comic-hasher"
"gitea.narnian.us/lordwelch/comic-hasher/cv"
"gitea.narnian.us/lordwelch/comic-hasher/storage"
)
var bufPool = &sync.Pool{
@ -215,15 +216,15 @@ func signalHandler(s *Server) {
func initializeStorage(opts Opts) (ch.HashStorage, error) {
switch opts.storageType {
case Map:
return ch.NewMapStorage()
return storage.NewMapStorage()
case BasicMap:
return ch.NewBasicMapStorage()
return storage.NewBasicMapStorage()
case Sqlite:
return ch.NewSqliteStorage("sqlite", opts.sqlitePath)
return storage.NewSqliteStorage("sqlite", opts.sqlitePath)
case Sqlite3:
return ch.NewSqliteStorage("sqlite3", opts.sqlitePath)
return storage.NewSqliteStorage("sqlite3", opts.sqlitePath)
case VPTree:
return ch.NewVPStorage()
return storage.NewVPStorage()
}
return nil, errors.New("Unknown storage type provided")
}

View File

@ -83,6 +83,19 @@ func (f *Format) Set(s string) error {
return nil
}
func (h *SavedHash) Clone() SavedHash {
return SavedHash{
Hash: Hash{
Hash: h.Hash.Hash,
Kind: h.Hash.Kind,
},
ID: ID{
Domain: NewSource(*h.ID.Domain),
ID: strings.Clone(h.ID.ID),
},
}
}
func (s *SavedHashes) InsertHash(hash SavedHash) {
index, itemFound := slices.BinarySearchFunc(s.Hashes, hash, func(existing SavedHash, target SavedHash) int {
return cmp.Or(

View File

@ -1,4 +1,4 @@
package ch
package storage
import (
"cmp"
@ -6,49 +6,34 @@ import (
"fmt"
"math/bits"
"slices"
"strings"
"sync"
ch "gitea.narnian.us/lordwelch/comic-hasher"
"gitea.narnian.us/lordwelch/goimagehash"
)
type bmHash struct {
Hash Hash
ID ID
}
func NewbmHash(data SavedHash) bmHash {
return bmHash{
Hash: Hash{
Hash: data.Hash.Hash,
Kind: data.Hash.Kind,
},
ID: ID{
Domain: data.ID.Domain,
ID: strings.Clone(data.ID.ID),
},
}
}
type basicMapStorage struct {
hashMutex *sync.RWMutex
ids IDMap
aHashes []bmHash
dHashes []bmHash
pHashes []bmHash
aHashes []ch.SavedHash
dHashes []ch.SavedHash
pHashes []ch.SavedHash
}
type IDs struct {
id *ID
idList *[]*ID
id *ch.ID
idList *[]*ch.ID
}
type IDMap struct {
ids []IDs
}
func (m *IDMap) InsertID(id *ID) *ID {
return m.insertID(id, &[]*ID{id})
func (m *IDMap) InsertID(id *ch.ID) *ch.ID {
return m.insertID(id, &[]*ch.ID{id})
}
func (m *IDMap) insertID(id *ID, idList *[]*ID) *ID {
index, found := slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ID) int {
func (m *IDMap) insertID(id *ch.ID, idList *[]*ch.ID) *ch.ID {
index, found := slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ch.ID) int {
return id.id.Compare(*target)
})
if !found {
@ -66,40 +51,40 @@ func (m *IDMap) sort() {
})
}
func (m *IDMap) FindID(id *ID) (int, bool) {
return slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ID) int {
func (m *IDMap) FindID(id *ch.ID) (int, bool) {
return slices.BinarySearchFunc(m.ids, id, func(id IDs, target *ch.ID) int {
return id.id.Compare(*target)
})
}
func (m *IDMap) GetIDs(id *ID) []ID {
func (m *IDMap) GetIDs(id *ch.ID) []ch.ID {
index, found := m.FindID(id)
if !found {
return nil
}
ids := make([]ID, 0, len(*m.ids[index].idList))
ids := make([]ch.ID, 0, len(*m.ids[index].idList))
for _, id := range *m.ids[index].idList {
ids = append(ids, *id)
}
return ids
}
func (m *IDMap) AssociateIDs(newids []NewIDs) error {
func (m *IDMap) AssociateIDs(newids []ch.NewIDs) error {
for _, newid := range newids {
index, found := m.FindID(&newid.OldID)
if !found {
return ErrIDNotFound
}
*(m.ids[index].idList) = InsertIDp(*(m.ids[index].idList), &newid.NewID)
*(m.ids[index].idList) = ch.InsertIDp(*(m.ids[index].idList), &newid.NewID)
m.insertID(&newid.NewID, m.ids[index].idList)
}
return nil
}
// func (m *IDMap) NewID(domain Source, id string) *ID {
// newID := ID{domain, id}
// index, found := slices.BinarySearchFunc(m.idList, newID, func(id *ID, target ID) int {
// func (m *IDMap) NewID(domain Source, id string) *ch.ID {
// newID := ch.ID{domain, id}
// index, found := slices.BinarySearchFunc(m.idList, newID, func(id *ch.ID, target ch.ID) int {
// return id.Compare(*target)
// })
// if !found {
@ -111,11 +96,11 @@ func (m *IDMap) AssociateIDs(newids []NewIDs) error {
var ErrIDNotFound = errors.New("ID not found on this server")
// atleast must have a read lock before using
func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
matchingHashes := make([]Result, 0, 20) // hope that we don't need more
func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, searchHash uint64) []ch.Result {
matchingHashes := make([]ch.Result, 0, 20) // hope that we don't need more
mappedIds := map[int]bool{}
storedHash := bmHash{} // reduces allocations and ensures queries are <1s
storedHash := ch.SavedHash{} // reduces allocations and ensures queries are <1s
for _, storedHash = range *b.getCurrentHashes(kind) {
distance := bits.OnesCount64(searchHash ^ storedHash.Hash.Hash)
if distance <= maxDistance {
@ -124,7 +109,7 @@ func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, search
continue
}
mappedIds[index] = true
matchingHashes = append(matchingHashes, Result{
matchingHashes = append(matchingHashes, ch.Result{
Hash: storedHash.Hash,
ID: storedHash.ID,
Distance: distance,
@ -135,8 +120,8 @@ func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, search
return matchingHashes
}
func (b *basicMapStorage) exactMatches(hashes []Hash, max int) []Result {
var foundMatches []Result
func (b *basicMapStorage) exactMatches(hashes []ch.Hash, max int) []ch.Result {
var foundMatches []ch.Result
for _, hash := range hashes {
mappedIds := map[int]bool{}
@ -149,7 +134,7 @@ func (b *basicMapStorage) exactMatches(hashes []Hash, max int) []Result {
}
mappedIds[index] = true
foundMatches = append(foundMatches, Result{
foundMatches = append(foundMatches, ch.Result{
Hash: storedHash.Hash,
ID: storedHash.ID,
Distance: 0,
@ -162,20 +147,20 @@ func (b *basicMapStorage) exactMatches(hashes []Hash, max int) []Result {
return foundMatches
}
func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
func (b *basicMapStorage) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
var (
foundMatches []Result
tl timeLog
foundMatches []ch.Result
tl ch.TimeLog
)
tl.resetTime()
defer tl.logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
tl.ResetTime()
defer tl.LogTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly))
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
foundMatches = b.exactMatches(hashes, max)
tl.logTime("Search Exact")
tl.LogTime("Search Exact")
if len(foundMatches) > 0 {
return foundMatches, nil
}
@ -193,7 +178,7 @@ func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]
}
// getCurrentHashes must have a read lock before using
func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]bmHash {
func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]ch.SavedHash {
if kind == goimagehash.AHash {
return &b.aHashes
}
@ -209,9 +194,9 @@ func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]bmHash {
// findHash must have a read lock before using
// return value is index, count
// if count < 1 then no results were found
func (b *basicMapStorage) findHash(hash Hash) (int, int) {
func (b *basicMapStorage) findHash(hash ch.Hash) (int, int) {
currentHashes := *b.getCurrentHashes(hash.Kind)
index, found := slices.BinarySearchFunc(currentHashes, hash, func(existing bmHash, target Hash) int {
index, found := slices.BinarySearchFunc(currentHashes, hash, func(existing ch.SavedHash, target ch.Hash) int {
return cmp.Compare(existing.Hash.Hash, target.Hash)
})
if !found {
@ -225,7 +210,7 @@ func (b *basicMapStorage) findHash(hash Hash) (int, int) {
}
// insertHash must already have a lock
func (b *basicMapStorage) insertHash(hash Hash, id ID) {
func (b *basicMapStorage) insertHash(hash ch.Hash, id ch.ID) {
currentHashes := b.getCurrentHashes(hash.Kind)
index, count := b.findHash(hash)
max := index + count
@ -235,12 +220,15 @@ func (b *basicMapStorage) insertHash(hash Hash, id ID) {
}
}
sh := bmHash{hash, id}
sh := ch.SavedHash{
Hash: hash,
ID: id,
}
*currentHashes = slices.Insert(*currentHashes, index, sh)
b.ids.InsertID(&sh.ID)
}
func (b *basicMapStorage) MapHashes(hash ImageHash) {
func (b *basicMapStorage) MapHashes(hash ch.ImageHash) {
b.hashMutex.Lock()
defer b.hashMutex.Unlock()
for _, ih := range hash.Hashes {
@ -249,7 +237,7 @@ func (b *basicMapStorage) MapHashes(hash ImageHash) {
}
// DecodeHashes must already have a lock
func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
func (b *basicMapStorage) DecodeHashes(hashes *ch.SavedHashes) error {
if hashes == nil {
return nil
}
@ -257,7 +245,7 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
// Initialize all the known equal IDs
for _, ids := range hashes.IDs {
new_ids := make([]*ID, 0, len(ids))
new_ids := make([]*ch.ID, 0, len(ids))
for _, id := range ids {
new_ids = append(new_ids, &id)
}
@ -270,7 +258,7 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
}
b.ids.sort()
slices.SortFunc(hashes.Hashes, func(existing, target SavedHash) int {
slices.SortFunc(hashes.Hashes, func(existing, target ch.SavedHash) int {
return cmp.Or(
cmp.Compare(*existing.ID.Domain, *target.ID.Domain), // Sorted for id insertion efficiency
cmp.Compare(existing.ID.ID, target.ID.ID), // Sorted for id insertion efficiency
@ -295,31 +283,31 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
}
// Assume they are probably fairly equally split between hash types
b.aHashes = make([]bmHash, 0, aHashCount)
b.dHashes = make([]bmHash, 0, dHashCount)
b.pHashes = make([]bmHash, 0, pHashCount)
b.aHashes = make([]ch.SavedHash, 0, aHashCount)
b.dHashes = make([]ch.SavedHash, 0, dHashCount)
b.pHashes = make([]ch.SavedHash, 0, pHashCount)
for i := range hashes.Hashes {
bmhash := NewbmHash(hashes.Hashes[i])
hash := hashes.Hashes[i].Clone() // Not cloning this will keep strings/slices loaded from json wasting memory
if hashes.Hashes[i].Hash.Kind == goimagehash.AHash {
b.aHashes = append(b.aHashes, bmhash)
b.aHashes = append(b.aHashes, hash)
}
if hashes.Hashes[i].Hash.Kind == goimagehash.DHash {
b.dHashes = append(b.dHashes, bmhash)
b.dHashes = append(b.dHashes, hash)
}
if hashes.Hashes[i].Hash.Kind == goimagehash.PHash {
b.pHashes = append(b.pHashes, bmhash)
b.pHashes = append(b.pHashes, hash)
}
if hashes.Hashes[i].ID == (ID{}) {
if hashes.Hashes[i].ID == (ch.ID{}) {
fmt.Println("Empty ID detected")
panic(hashes.Hashes[i])
}
// TODO: Make loading this more efficient
// All known equal IDs are already mapped we can add any missing ones from hashes
b.ids.InsertID(&bmhash.ID)
b.ids.InsertID(&hash.ID)
}
hashCmp := func(existing, target bmHash) int {
hashCmp := func(existing, target ch.SavedHash) int {
return cmp.Or(
cmp.Compare(existing.Hash.Hash, target.Hash.Hash),
cmp.Compare(*existing.ID.Domain, *target.ID.Domain),
@ -334,9 +322,9 @@ func (b *basicMapStorage) DecodeHashes(hashes *SavedHashes) error {
}
// EncodeHashes should already have a lock
func (b *basicMapStorage) EncodeHashes() (*SavedHashes, error) {
savedHashes := SavedHashes{
Hashes: make([]SavedHash, 0, len(b.aHashes)+len(b.dHashes)+len(b.pHashes)),
func (b *basicMapStorage) EncodeHashes() (*ch.SavedHashes, error) {
savedHashes := ch.SavedHashes{
Hashes: make([]ch.SavedHash, 0, len(b.aHashes)+len(b.dHashes)+len(b.pHashes)),
}
// savedHashes.Hashes = append(savedHashes.Hashes, b.aHashes...)
// savedHashes.Hashes = append(savedHashes.Hashes, b.dHashes...)
@ -357,28 +345,28 @@ func (b *basicMapStorage) EncodeHashes() (*SavedHashes, error) {
return &savedHashes, nil
}
func (b *basicMapStorage) AssociateIDs(newids []NewIDs) error {
func (b *basicMapStorage) AssociateIDs(newids []ch.NewIDs) error {
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
return b.ids.AssociateIDs(newids)
}
func (b *basicMapStorage) GetIDs(id ID) IDList {
func (b *basicMapStorage) GetIDs(id ch.ID) ch.IDList {
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
ids := b.ids.GetIDs(&id)
return ToIDList(ids)
return ch.ToIDList(ids)
}
func NewBasicMapStorage() (HashStorage, error) {
func NewBasicMapStorage() (ch.HashStorage, error) {
storage := &basicMapStorage{
hashMutex: &sync.RWMutex{},
ids: IDMap{
ids: []IDs{},
},
aHashes: []bmHash{},
dHashes: []bmHash{},
pHashes: []bmHash{},
aHashes: []ch.SavedHash{},
dHashes: []ch.SavedHash{},
pHashes: []ch.SavedHash{},
}
return storage, nil
}

View File

@ -1,10 +1,11 @@
package ch
package storage
import (
"fmt"
"slices"
"sync"
ch "gitea.narnian.us/lordwelch/comic-hasher"
"gitea.narnian.us/lordwelch/goimagehash"
)
@ -15,10 +16,10 @@ type MapStorage struct {
partialPHash [8]map[uint8][]uint64
}
func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
func (m *MapStorage) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
var (
foundMatches []Result
tl timeLog
foundMatches []ch.Result
tl ch.TimeLog
)
m.hashMutex.RLock()
defer m.hashMutex.RUnlock()
@ -26,13 +27,13 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
foundMatches = m.exactMatches(hashes, max)
tl.logTime("Search Exact")
tl.LogTime("Search Exact")
if len(foundMatches) > 0 {
return foundMatches, nil
}
}
tl.resetTime()
defer tl.logTime("Search Complete")
tl.ResetTime()
defer tl.LogTime("Search Complete")
totalPartialHashes := 0
@ -40,15 +41,18 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
currentHashes, currentPartialHashes := m.getCurrentHashes(searchHash.Kind)
potentialMatches := []uint64{}
for i, partialHash := range SplitHash(searchHash.Hash) {
for i, partialHash := range ch.SplitHash(searchHash.Hash) {
potentialMatches = append(potentialMatches, currentPartialHashes[i][partialHash]...)
}
totalPartialHashes += len(potentialMatches)
mappedIds := map[int]bool{}
for _, match := range Atleast(max, searchHash.Hash, potentialMatches) {
matchedHash := Hash{match.Hash, searchHash.Kind}
for _, match := range ch.Atleast(max, searchHash.Hash, potentialMatches) {
matchedHash := ch.Hash{
Hash: match.Hash,
Kind: searchHash.Kind,
}
index, count := m.findHash(matchedHash)
if count < 1 {
continue
@ -60,7 +64,7 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
}
mappedIds[idIndex] = true
foundMatches = append(foundMatches, Result{
foundMatches = append(foundMatches, ch.Result{
Hash: storedHash.Hash,
ID: storedHash.ID,
Distance: 0,
@ -75,7 +79,7 @@ func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Resul
}
// getCurrentHashes must have a read lock before using
func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]bmHash, [8]map[uint8][]uint64) {
func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]ch.SavedHash, [8]map[uint8][]uint64) {
if kind == goimagehash.AHash {
return m.aHashes, m.partialAHash
}
@ -88,17 +92,17 @@ func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]bmHash, [8]map[u
panic("Unknown hash type: " + kind.String())
}
func (m *MapStorage) MapHashes(hash ImageHash) {
func (m *MapStorage) MapHashes(hash ch.ImageHash) {
m.basicMapStorage.MapHashes(hash)
for _, hash := range hash.Hashes {
_, partialHashes := m.getCurrentHashes(hash.Kind)
for i, partialHash := range SplitHash(hash.Hash) {
partialHashes[i][partialHash] = Insert(partialHashes[i][partialHash], hash.Hash)
for i, partialHash := range ch.SplitHash(hash.Hash) {
partialHashes[i][partialHash] = ch.Insert(partialHashes[i][partialHash], hash.Hash)
}
}
}
func (m *MapStorage) DecodeHashes(hashes *SavedHashes) error {
func (m *MapStorage) DecodeHashes(hashes *ch.SavedHashes) error {
if hashes == nil {
return nil
}
@ -117,7 +121,7 @@ func (m *MapStorage) DecodeHashes(hashes *SavedHashes) error {
return nil
}
func NewMapStorage() (HashStorage, error) {
func NewMapStorage() (ch.HashStorage, error) {
storage := &MapStorage{
basicMapStorage: basicMapStorage{
@ -125,9 +129,9 @@ func NewMapStorage() (HashStorage, error) {
ids: IDMap{
ids: []IDs{},
},
aHashes: []bmHash{},
dHashes: []bmHash{},
pHashes: []bmHash{},
aHashes: []ch.SavedHash{},
dHashes: []ch.SavedHash{},
pHashes: []ch.SavedHash{},
},
partialAHash: newPartialHash(),
partialDHash: newPartialHash(),
@ -149,9 +153,9 @@ func newPartialHash() [8]map[uint8][]uint64 {
}
}
func mapPartialHashes(hashes []bmHash, partialHashMap [8]map[uint8][]uint64) {
func mapPartialHashes(hashes []ch.SavedHash, partialHashMap [8]map[uint8][]uint64) {
for _, savedHash := range hashes {
for i, partialHash := range SplitHash(savedHash.Hash.Hash) {
for i, partialHash := range ch.SplitHash(savedHash.Hash.Hash) {
partialHashMap[i][partialHash] = append(partialHashMap[i][partialHash], savedHash.Hash.Hash)
}
}

View File

@ -1,4 +1,4 @@
package ch
package storage
import (
"context"
@ -8,6 +8,7 @@ import (
"log"
"math/bits"
ch "gitea.narnian.us/lordwelch/comic-hasher"
_ "modernc.org/sqlite"
)
@ -26,19 +27,19 @@ type sqliteStorage struct {
idExists *sql.Stmt
}
func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, hash Hash) (map[ID][]ID, error) {
func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, hash ch.Hash) (map[ch.ID][]ch.ID, error) {
if statement == nil {
statement = s.hashExactMatchStatement
}
hashes := map[ID][]ID{}
hashes := map[ch.ID][]ch.ID{}
rows, err := statement.Query(hash.Kind, int64(hash.Hash))
if err != nil {
return hashes, err
}
for rows.Next() {
var (
id ID
foundID ID
id ch.ID
foundID ch.ID
)
err = rows.Scan(&foundID.Domain, &foundID.ID, &id.Domain, &id.ID)
if err != nil {
@ -51,24 +52,24 @@ func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, hash Hash) (map[ID]
return hashes, nil
}
func (s *sqliteStorage) findPartialHashes(tl timeLog, statement *sql.Stmt, max int, hash Hash) ([]Result, error) {
func (s *sqliteStorage) findPartialHashes(tl ch.TimeLog, statement *sql.Stmt, max int, hash ch.Hash) ([]ch.Result, error) {
if statement == nil {
statement = s.hashPartialMatchStatement
}
hashResults := []Result{}
hashResults := []ch.Result{}
rows, err := statement.Query(hash.Kind, int64(hash.Hash))
if err != nil {
return hashResults, err
}
results := map[SavedHash][]ID{}
results := map[ch.SavedHash][]ch.ID{}
for rows.Next() {
var (
tmpHash int64
sqlHash = SavedHash{
Hash: Hash{Kind: hash.Kind},
sqlHash = ch.SavedHash{
Hash: ch.Hash{Kind: hash.Kind},
}
id ID
id ch.ID
)
err = rows.Scan(&sqlHash.ID.Domain, &sqlHash.ID.ID, &tmpHash, &id.Domain, &id.ID)
if err != nil {
@ -79,7 +80,7 @@ func (s *sqliteStorage) findPartialHashes(tl timeLog, statement *sql.Stmt, max i
results[sqlHash] = append(results[sqlHash], id)
}
for sqlHash, ids := range results {
res := Result{
res := ch.Result{
Hash: sqlHash.Hash,
ID: sqlHash.ID,
Distance: bits.OnesCount64(hash.Hash ^ sqlHash.Hash.Hash),
@ -94,18 +95,18 @@ func (s *sqliteStorage) findPartialHashes(tl timeLog, statement *sql.Stmt, max i
func (s *sqliteStorage) dropIndexes() error {
_, err := s.db.Exec(`
DROP INDEX IF EXISTS hash_index;
DROP INDEX IF EXISTS hash_1_index;
DROP INDEX IF EXISTS hash_2_index;
DROP INDEX IF EXISTS hash_3_index;
DROP INDEX IF EXISTS hash_4_index;
DROP INDEX IF EXISTS hash_5_index;
DROP INDEX IF EXISTS hash_6_index;
DROP INDEX IF EXISTS hash_7_index;
DROP INDEX IF EXISTS hash_8_index;
DROP INDEX IF EXISTS hash_index;
DROP INDEX IF EXISTS hash_1_index;
DROP INDEX IF EXISTS hash_2_index;
DROP INDEX IF EXISTS hash_3_index;
DROP INDEX IF EXISTS hash_4_index;
DROP INDEX IF EXISTS hash_5_index;
DROP INDEX IF EXISTS hash_6_index;
DROP INDEX IF EXISTS hash_7_index;
DROP INDEX IF EXISTS hash_8_index;
DROP INDEX IF EXISTS id_domain;
`)
DROP INDEX IF EXISTS id_domain;
`)
if err != nil {
return err
}
@ -114,32 +115,32 @@ func (s *sqliteStorage) dropIndexes() error {
func (s *sqliteStorage) createIndexes() error {
_, err := s.db.Exec(`
CREATE INDEX IF NOT EXISTS hash_index ON Hashes (kind, hash);
CREATE INDEX IF NOT EXISTS hash_1_index ON Hashes ((hash >> (0 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_2_index ON Hashes ((hash >> (1 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_3_index ON Hashes ((hash >> (2 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_4_index ON Hashes ((hash >> (3 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_5_index ON Hashes ((hash >> (4 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_6_index ON Hashes ((hash >> (5 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_7_index ON Hashes ((hash >> (6 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_8_index ON Hashes ((hash >> (7 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_index ON Hashes (kind, hash);
CREATE INDEX IF NOT EXISTS hash_1_index ON Hashes ((hash >> (0 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_2_index ON Hashes ((hash >> (1 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_3_index ON Hashes ((hash >> (2 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_4_index ON Hashes ((hash >> (3 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_5_index ON Hashes ((hash >> (4 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_6_index ON Hashes ((hash >> (5 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_7_index ON Hashes ((hash >> (6 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_8_index ON Hashes ((hash >> (7 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, stringid);
PRAGMA shrink_memory;
ANALYZE;
`)
CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, stringid);
PRAGMA shrink_memory;
ANALYZE;
`)
if err != nil {
return err
}
return nil
}
func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
func (s *sqliteStorage) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
var (
foundMatches []Result
tl timeLog
foundMatches []ch.Result
tl ch.TimeLog
)
tl.resetTime()
tl.ResetTime()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes {
@ -148,7 +149,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
return foundMatches, err
}
for id, equivalentIDs := range idlist {
foundMatches = append(foundMatches, Result{
foundMatches = append(foundMatches, ch.Result{
Hash: hash,
ID: id,
Distance: 0,
@ -157,7 +158,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
}
}
tl.logTime("Search Exact")
tl.LogTime("Search Exact")
if len(foundMatches) > 0 {
return foundMatches, nil
}
@ -170,7 +171,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
if err != nil {
return foundMatches, err
}
tl.logTime(fmt.Sprintf("Search partial %v", hash.Kind))
tl.LogTime(fmt.Sprintf("Search partial %v", hash.Kind))
for _, hash := range results {
if _, alreadyMatched := foundHashes[hash.Hash.Hash]; !alreadyMatched {
@ -185,7 +186,7 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re
return foundMatches, nil
}
func (s *sqliteStorage) mapHashes(tx *sql.Tx, hash ImageHash) {
func (s *sqliteStorage) mapHashes(tx *sql.Tx, hash ch.ImageHash) {
var err error
insertHash := tx.Stmt(s.insertHash)
insertID := tx.Stmt(s.insertID)
@ -234,7 +235,7 @@ func (s *sqliteStorage) mapHashes(tx *sql.Tx, hash ImageHash) {
}
}
}
func (s *sqliteStorage) MapHashes(hash ImageHash) {
func (s *sqliteStorage) MapHashes(hash ch.ImageHash) {
tx, err := s.db.BeginTx(context.Background(), nil)
if err != nil {
panic(err)
@ -246,7 +247,7 @@ func (s *sqliteStorage) MapHashes(hash ImageHash) {
}
}
func (s *sqliteStorage) DecodeHashes(hashes *SavedHashes) error {
func (s *sqliteStorage) DecodeHashes(hashes *ch.SavedHashes) error {
return nil
err := s.dropIndexes()
if err != nil {
@ -285,8 +286,8 @@ func (s *sqliteStorage) DecodeHashes(hashes *SavedHashes) error {
}
for _, savedHash := range hashes.Hashes {
s.mapHashes(tx, ImageHash{
Hashes: []Hash{savedHash.Hash},
s.mapHashes(tx, ch.ImageHash{
Hashes: []ch.Hash{savedHash.Hash},
ID: savedHash.ID,
})
}
@ -302,8 +303,8 @@ func (s *sqliteStorage) DecodeHashes(hashes *SavedHashes) error {
return nil
}
func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
hashes := SavedHashes{}
func (s *sqliteStorage) EncodeHashes() (*ch.SavedHashes, error) {
hashes := ch.SavedHashes{}
tx, err := s.db.Begin()
if err != nil {
return &hashes, err
@ -315,7 +316,7 @@ func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
}
for rows.Next() {
var (
hash SavedHash
hash ch.SavedHash
tmpHash int64
)
err = rows.Scan(&hash.Hash.Kind, &tmpHash, &hash.ID.Domain, &hash.ID.ID)
@ -331,11 +332,11 @@ func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
}
var (
previousEid int64 = -1
ids []ID
ids []ch.ID
)
for rows.Next() {
var (
id ID
id ch.ID
newEid int64
)
err = rows.Scan(&newEid, &id.Domain, &id.Domain)
@ -348,14 +349,14 @@ func (s *sqliteStorage) EncodeHashes() (*SavedHashes, error) {
if len(ids) > 1 {
hashes.IDs = append(hashes.IDs, ids)
}
ids = make([]ID, 0)
ids = make([]ch.ID, 0)
}
ids = append(ids, id)
}
return &hashes, nil
}
func (s *sqliteStorage) AssociateIDs(newIDs []NewIDs) error {
func (s *sqliteStorage) AssociateIDs(newIDs []ch.NewIDs) error {
tx, err := s.db.BeginTx(context.Background(), nil)
if err != nil {
panic(err)
@ -397,21 +398,21 @@ func (s *sqliteStorage) AssociateIDs(newIDs []NewIDs) error {
return nil
}
func (s *sqliteStorage) GetIDs(id ID) IDList {
var ids []ID
func (s *sqliteStorage) GetIDs(id ch.ID) ch.IDList {
var ids []ch.ID
rows, err := s.idMatchStatement.Query(id.Domain, id.ID)
if err != nil {
return nil
}
for rows.Next() {
var id ID
var id ch.ID
err = rows.Scan(&id.Domain, &id.ID)
if err != nil {
return nil
}
ids = append(ids, id)
}
return ToIDList(ids)
return ch.ToIDList(ids)
}
func (s *sqliteStorage) PrepareStatements() error {
@ -437,50 +438,50 @@ func (s *sqliteStorage) PrepareStatements() error {
return fmt.Errorf("failed to prepare database statements: %w", err)
}
s.hashExactMatchStatement, err = s.db.Prepare(`
select QIDs.domain, QIDs.stringid, IDs.domain, IDs.stringid from IDs
join IDsToEquivalantIDs as IEIDs on IDs.id=IEIDs.idid
join (
select QEIDs.id as id from EquivalentIDs as QEIDs
join IDsToEquivalantIDs as QIEIDs on QEIDs.id=QIEIDs.equivalentid
join IDs as QIDs on QIDs.id=QIEIDs.idid
join Hashes on Hashes.id=QIDs.id
where (Hashes.kind=? AND Hashes.hash=?)
) as EIDs on EIDs.id=IEIDs.equivalentid;
`)
select QIDs.domain, QIDs.stringid, IDs.domain, IDs.stringid from IDs
join IDsToEquivalantIDs as IEIDs on IDs.id=IEIDs.idid
join (
select QEIDs.id as id from EquivalentIDs as QEIDs
join IDsToEquivalantIDs as QIEIDs on QEIDs.id=QIEIDs.equivalentid
join IDs as QIDs on QIDs.id=QIEIDs.idid
join Hashes on Hashes.id=QIDs.id
where (Hashes.kind=? AND Hashes.hash=?)
) as EIDs on EIDs.id=IEIDs.equivalentid;
`)
if err != nil {
return fmt.Errorf("failed to prepare database statements: %w", err)
}
s.hashPartialMatchStatement, err = s.db.Prepare(`
select QIDs.domain, QIDs.stringid, EIDs.hash, IDs.domain, IDs.stringid from IDs
join IDsToEquivalantIDs as IEIDs on IDs.id=IEIDs.idid
join (
select Hashes.hash as hash, QEIDs.id as id from EquivalentIDs as QEIDs
join IDsToEquivalantIDs as QIEIDs on QEIDs.id=QIEIDs.equivalentid
join IDs as QIDs on QIDs.id=QIEIDs.idid
join Hashes on Hashes.id=QIDs.id
where (Hashes.kind=? AND (((Hashes.hash >> (0 * 8) & 0xFF)=(?2 >> (0 * 8) & 0xFF)) OR ((Hashes.hash >> (1 * 8) & 0xFF)=(?2 >> (1 * 8) & 0xFF)) OR ((Hashes.hash >> (2 * 8) & 0xFF)=(?2 >> (2 * 8) & 0xFF)) OR ((Hashes.hash >> (3 * 8) & 0xFF)=(?2 >> (3 * 8) & 0xFF)) OR ((Hashes.hash >> (4 * 8) & 0xFF)=(?2 >> (4 * 8) & 0xFF)) OR ((Hashes.hash >> (5 * 8) & 0xFF)=(?2 >> (5 * 8) & 0xFF)) OR ((Hashes.hash >> (6 * 8) & 0xFF)=(?2 >> (6 * 8) & 0xFF)) OR ((Hashes.hash >> (7 * 8) & 0xFF)=(?2 >> (7 * 8) & 0xFF))))
) as EIDs on EIDs.id=IEIDs.equivalentid;
`)
select QIDs.domain, QIDs.stringid, EIDs.hash, IDs.domain, IDs.stringid from IDs
join IDsToEquivalantIDs as IEIDs on IDs.id=IEIDs.idid
join (
select Hashes.hash as hash, QEIDs.id as id from EquivalentIDs as QEIDs
join IDsToEquivalantIDs as QIEIDs on QEIDs.id=QIEIDs.equivalentid
join IDs as QIDs on QIDs.id=QIEIDs.idid
join Hashes on Hashes.id=QIDs.id
where (Hashes.kind=? AND (((Hashes.hash >> (0 * 8) & 0xFF)=(?2 >> (0 * 8) & 0xFF)) OR ((Hashes.hash >> (1 * 8) & 0xFF)=(?2 >> (1 * 8) & 0xFF)) OR ((Hashes.hash >> (2 * 8) & 0xFF)=(?2 >> (2 * 8) & 0xFF)) OR ((Hashes.hash >> (3 * 8) & 0xFF)=(?2 >> (3 * 8) & 0xFF)) OR ((Hashes.hash >> (4 * 8) & 0xFF)=(?2 >> (4 * 8) & 0xFF)) OR ((Hashes.hash >> (5 * 8) & 0xFF)=(?2 >> (5 * 8) & 0xFF)) OR ((Hashes.hash >> (6 * 8) & 0xFF)=(?2 >> (6 * 8) & 0xFF)) OR ((Hashes.hash >> (7 * 8) & 0xFF)=(?2 >> (7 * 8) & 0xFF))))
) as EIDs on EIDs.id=IEIDs.equivalentid;
`)
if err != nil {
return fmt.Errorf("failed to prepare database statements: %w", err)
}
s.idMatchStatement, err = s.db.Prepare(`
select IDs.domain, IDs.stringid from IDs
join IDsToEquivalantIDs as IEIDs on IDs.id=IEIDs.idid
join (
select EIDs.* from EquivalentIDs as EIDs
join IDsToEquivalantIDs as QIEIDs on EIDs.id=QIEIDs.equivalentid
join IDs as QIDs on QIDs.id=QIEIDs.idid
where (QIDs.domain=? AND QIDs.stringid=?)
) as EIDs on EIDs.id=IEIDs.equivalentid;
`)
select IDs.domain, IDs.stringid from IDs
join IDsToEquivalantIDs as IEIDs on IDs.id=IEIDs.idid
join (
select EIDs.* from EquivalentIDs as EIDs
join IDsToEquivalantIDs as QIEIDs on EIDs.id=QIEIDs.equivalentid
join IDs as QIDs on QIDs.id=QIEIDs.idid
where (QIDs.domain=? AND QIDs.stringid=?)
) as EIDs on EIDs.id=IEIDs.equivalentid;
`)
if err != nil {
return fmt.Errorf("failed to prepare database statements: %w", err)
}
return nil
}
func NewSqliteStorage(db, path string) (HashStorage, error) {
func NewSqliteStorage(db, path string) (ch.HashStorage, error) {
sqlite := &sqliteStorage{}
sqlDB, err := sql.Open(db, fmt.Sprintf("file://%s?_pragma=cache_size(-200000)&_pragma=busy_timeout(500)&_pragma=hard_heap_limit(1073741824)&_pragma=journal_mode(wal)&_pragma=soft_heap_limit(314572800)", path))
if err != nil {
@ -488,34 +489,34 @@ func NewSqliteStorage(db, path string) (HashStorage, error) {
}
sqlite.db = sqlDB
_, err = sqlite.db.Exec(`
PRAGMA foreign_keys=ON;
CREATE TABLE IF NOT EXISTS IDs(
id INTEGER PRIMARY KEY,
stringid TEXT NOT NULL,
domain TEXT NOT NULL
);
PRAGMA foreign_keys=ON;
CREATE TABLE IF NOT EXISTS IDs(
id INTEGER PRIMARY KEY,
stringid TEXT NOT NULL,
domain TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS Hashes(
hash INTEGER NOT NULL,
kind INTEGER NOT NULL,
id INTEGER NOT NULL,
CREATE TABLE IF NOT EXISTS Hashes(
hash INTEGER NOT NULL,
kind INTEGER NOT NULL,
id INTEGER NOT NULL,
FOREIGN KEY(id) REFERENCES IDs(id)
);
FOREIGN KEY(id) REFERENCES IDs(id)
);
CREATE TABLE IF NOT EXISTS EquivalentIDs(
id integer primary key
);
CREATE TABLE IF NOT EXISTS EquivalentIDs(
id integer primary key
);
CREATE TABLE IF NOT EXISTS IDsToEquivalantIDs(
idid INTEGER NOT NULL,
equivalentid INTEGER NOT NULL,
PRIMARY KEY (idid, equivalentid),
CREATE TABLE IF NOT EXISTS IDsToEquivalantIDs(
idid INTEGER NOT NULL,
equivalentid INTEGER NOT NULL,
PRIMARY KEY (idid, equivalentid),
FOREIGN KEY(idid) REFERENCES IDs(id),
FOREIGN KEY(equivalentid) REFERENCES EquivalentIDs(id)
);
`)
FOREIGN KEY(idid) REFERENCES IDs(id),
FOREIGN KEY(equivalentid) REFERENCES EquivalentIDs(id)
);
`)
if err != nil {
panic(err)
}

View File

@ -1,6 +1,6 @@
//go:build cgo && !gokrazy
package ch
package storage
import (
_ "github.com/mattn/go-sqlite3"

View File

@ -1,6 +1,6 @@
//go:build !cgo && !gokrazy
package ch
package storage
import (
_ "github.com/ncruces/go-sqlite3/driver"

View File

@ -1,12 +1,13 @@
//go:build !gokrazy
package ch
package storage
import (
"errors"
"fmt"
"math/bits"
ch "gitea.narnian.us/lordwelch/comic-hasher"
"gitea.narnian.us/lordwelch/goimagehash"
"gonum.org/v1/gonum/spatial/vptree"
)
@ -15,14 +16,14 @@ type VPTree struct {
aTree *vptree.Tree
dTree *vptree.Tree
pTree *vptree.Tree
ids map[ID]*[]ID
ids map[ch.ID]*[]ch.ID
aHashes []vptree.Comparable // temporary, only used for vptree creation
dHashes []vptree.Comparable // temporary, only used for vptree creation
pHashes []vptree.Comparable // temporary, only used for vptree creation
}
type VPHash struct {
SavedHash
ch.SavedHash
}
func (h *VPHash) Distance(c vptree.Comparable) float64 {
@ -33,22 +34,22 @@ func (h *VPHash) Distance(c vptree.Comparable) float64 {
return float64(bits.OnesCount64(h.Hash.Hash ^ h2.Hash.Hash))
}
func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
func (v *VPTree) GetMatches(hashes []ch.Hash, max int, exactOnly bool) ([]ch.Result, error) {
var (
matches []Result
exactMatches []Result
tl timeLog
matches []ch.Result
exactMatches []ch.Result
tl ch.TimeLog
)
tl.resetTime()
defer tl.logTime("Search Complete")
tl.ResetTime()
defer tl.LogTime("Search Complete")
for _, hash := range hashes {
results := vptree.NewDistKeeper(float64(max))
currentTree := v.getCurrentTree(hash.Kind)
currentTree.NearestSet(results, &VPHash{SavedHash{Hash: hash}})
currentTree.NearestSet(results, &VPHash{ch.SavedHash{Hash: hash}})
mappedIds := map[*[]ID]bool{}
mappedIds := map[*[]ch.ID]bool{}
for _, result := range results.Heap {
storedHash := result.Comparable.(*VPHash)
ids := v.ids[storedHash.ID]
@ -57,14 +58,14 @@ func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, e
}
mappedIds[ids] = true
if result.Dist == 0 {
exactMatches = append(exactMatches, Result{
exactMatches = append(exactMatches, ch.Result{
Hash: storedHash.Hash,
ID: storedHash.ID,
Distance: 0,
EquivalentIDs: *v.ids[storedHash.ID],
})
} else {
matches = append(matches, Result{
matches = append(matches, ch.Result{
Hash: storedHash.Hash,
ID: storedHash.ID,
Distance: 0,
@ -93,11 +94,11 @@ func (v *VPTree) getCurrentTree(kind goimagehash.Kind) *vptree.Tree {
panic("Unknown hash type: " + kind.String())
}
func (v *VPTree) MapHashes(ImageHash) {
func (v *VPTree) MapHashes(ch.ImageHash) {
panic("Not Implemented")
}
func (v *VPTree) DecodeHashes(hashes *SavedHashes) error {
func (v *VPTree) DecodeHashes(hashes *ch.SavedHashes) error {
if hashes == nil {
return nil
}
@ -120,13 +121,13 @@ func (v *VPTree) DecodeHashes(hashes *SavedHashes) error {
v.pHashes = append(v.pHashes, &VPHash{savedHash})
}
if savedHash.ID == (ID{}) {
if savedHash.ID == (ch.ID{}) {
fmt.Println("Empty ID detected")
panic(savedHash)
}
// All known equal IDs are already mapped we can add any missing ones from hashes
if _, ok := v.ids[savedHash.ID]; !ok {
v.ids[savedHash.ID] = &[]ID{savedHash.ID}
v.ids[savedHash.ID] = &[]ch.ID{savedHash.ID}
}
}
@ -144,23 +145,23 @@ func (v *VPTree) DecodeHashes(hashes *SavedHashes) error {
}
return nil
}
func (v *VPTree) EncodeHashes() (*SavedHashes, error) {
return &SavedHashes{}, errors.New("Not Implemented")
func (v *VPTree) EncodeHashes() (*ch.SavedHashes, error) {
return &ch.SavedHashes{}, errors.New("Not Implemented")
}
func (v *VPTree) AssociateIDs(newIDs []NewIDs) error {
func (v *VPTree) AssociateIDs(newIDs []ch.NewIDs) error {
return errors.New("Not Implemented")
}
func (v *VPTree) GetIDs(id ID) IDList {
func (v *VPTree) GetIDs(id ch.ID) ch.IDList {
ids, found := v.ids[id]
if !found {
return nil
}
return ToIDList(*ids)
return ch.ToIDList(*ids)
}
func NewVPStorage() (HashStorage, error) {
func NewVPStorage() (ch.HashStorage, error) {
var err error
v := &VPTree{
aHashes: []vptree.Comparable{},

View File

@ -0,0 +1,13 @@
//go:build gokrazy
package storage
import (
"errors"
ch "gitea.narnian.us/lordwelch/comic-hasher"
)
func NewVPStorage() (ch.HashStorage, error) {
return nil, errors.New("VPTree not available")
}

View File

@ -5,17 +5,17 @@ import (
"time"
)
type timeLog struct {
type TimeLog struct {
total time.Duration
last time.Time
}
func (t *timeLog) resetTime() {
func (t *TimeLog) ResetTime() {
t.total = 0
t.last = time.Now()
}
func (t *timeLog) logTime(log string) {
func (t *TimeLog) LogTime(log string) {
now := time.Now()
diff := now.Sub(t.last)
t.last = now