diff --git a/BasicMap.go b/BasicMap.go index 04981f3..be1a815 100644 --- a/BasicMap.go +++ b/BasicMap.go @@ -14,155 +14,215 @@ import ( type basicMapStorage struct { hashMutex *sync.RWMutex - ids map[ID]*[]ID - hashes [3][]structHash + ids map[ID]*[]ID + aHashes []SavedHash + dHashes []SavedHash + pHashes []SavedHash } -type structHash struct { - hash uint64 - ids *[]ID -} +// atleast must have a read lock before using +func (b *basicMapStorage) atleast(kind goimagehash.Kind, maxDistance int, searchHash uint64) []Result { + matchingHashes := make([]Result, 0, 20) // hope that we don't need more -func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result { - hashType := int(hashKind) - 1 - matchingHashes := make([]Result, 0, 100) // hope that we don't need all of them - b.hashMutex.RLock() - defer b.hashMutex.RUnlock() - for _, storedHash := range b.hashes[hashType] { - distance := bits.OnesCount64(searchHash ^ storedHash.hash) + mappedIds := map[*[]ID]bool{} + for _, storedHash := range *b.getCurrentHashes(kind) { + distance := bits.OnesCount64(searchHash ^ storedHash.Hash.Hash) if distance <= maxDistance { - matchingHashes = append(matchingHashes, Result{ToIDList(*storedHash.ids), distance, Hash{storedHash.hash, hashKind}}) + ids := b.ids[storedHash.ID] + if mappedIds[ids] { + continue + } + mappedIds[ids] = true + matchingHashes = append(matchingHashes, Result{ToIDList(*b.ids[storedHash.ID]), distance, storedHash.Hash}) } } return matchingHashes } + func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) { - var foundMatches []Result - resetTime() - defer logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly)) + var ( + foundMatches []Result + tl timeLog + ) + tl.resetTime() + defer tl.logTime(fmt.Sprintf("Search Complete: max: %v ExactOnly: %v", max, exactOnly)) + b.hashMutex.RLock() + defer b.hashMutex.RUnlock() if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate for _, hash := range hashes { - hashType := int(hash.Kind) - 1 - b.hashMutex.RLock() - index, hashFound := b.findHash(hashType, hash.Hash) - if hashFound { - foundMatches = append(foundMatches, Result{ - Distance: 0, - Hash: hash, - IDs: ToIDList(*b.hashes[hashType][index].ids), - }) + mappedIds := map[*[]ID]bool{} + + index, count := b.findHash(hash) + if count > 0 { + for _, storedHash := range (*b.getCurrentHashes(hash.Kind))[index : index+count] { + ids := b.ids[storedHash.ID] + if mappedIds[ids] { + continue + } + mappedIds[ids] = true + + foundMatches = append(foundMatches, Result{ + Distance: 0, + Hash: storedHash.Hash, + IDs: ToIDList(*b.ids[storedHash.ID]), + }) + } } - b.hashMutex.RUnlock() + } - logTime("Search Exact") - // If we have exact matches don't bother with other matches - if len(foundMatches) > 0 && exactOnly { - return foundMatches, nil - } + tl.logTime("Search Exact") + + return foundMatches, nil } foundHashes := make(map[uint64]struct{}) totalPartialHashes := 0 + for _, hash := range hashes { - for _, match := range b.Atleast(hash.Kind, max, hash.Hash) { - _, alreadyMatched := foundHashes[match.Hash.Hash] - if alreadyMatched { - continue - } - foundHashes[match.Hash.Hash] = struct{}{} - foundMatches = append(foundMatches, match) - } + foundMatches = append(foundMatches, b.atleast(hash.Kind, max, hash.Hash)...) } fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes)) return foundMatches, nil } -// findHash must have a read lock before using -func (b *basicMapStorage) findHash(hashType int, hash uint64) (int, bool) { - return slices.BinarySearchFunc(b.hashes[hashType], hash, func(e structHash, t uint64) int { - return cmp.Compare(e.hash, t) - }) +// getCurrentHashes must have a read lock before using +func (b *basicMapStorage) getCurrentHashes(kind goimagehash.Kind) *[]SavedHash { + if kind == goimagehash.AHash { + return &b.aHashes + } + if kind == goimagehash.DHash { + return &b.dHashes + } + if kind == goimagehash.PHash { + return &b.pHashes + } + panic("Unknown hash type: " + kind.String()) } -// insertHash will take a write lock if the hash is not found -func (b *basicMapStorage) insertHash(hashType int, hash uint64, ids *[]ID) { - b.hashMutex.RLock() - index, hashFound := b.findHash(hashType, hash) - b.hashMutex.RUnlock() - if hashFound { - return +// findHash must have a read lock before using +// return value is index, count +// if count < 1 then no results were found +func (b *basicMapStorage) findHash(hash Hash) (int, int) { + currentHashes := *b.getCurrentHashes(hash.Kind) + index, found := slices.BinarySearchFunc(currentHashes, hash, func(existing SavedHash, target Hash) int { + return cmp.Compare(existing.Hash.Hash, target.Hash) + }) + if !found { + return index, 0 + } + count := 0 + for i := index + 1; i < len(currentHashes) && currentHashes[i].Hash.Hash == hash.Hash; i++ { + count++ + } + return index, count +} + +// insertHash must already have a lock +func (b *basicMapStorage) insertHash(hash Hash, id ID) { + currentHashes := b.getCurrentHashes(hash.Kind) + index, count := b.findHash(hash) + max := index + count + for ; index < max; index++ { + if (*currentHashes)[index].ID == id { + return + } + } + + *currentHashes = slices.Insert(*currentHashes, index, SavedHash{hash, id}) + if _, mapped := b.ids[id]; !mapped { + b.ids[id] = &[]ID{id} } - b.hashMutex.Lock() - b.hashes[hashType] = slices.Insert(b.hashes[hashType], index, structHash{hash, ids}) - b.hashMutex.Unlock() } func (b *basicMapStorage) MapHashes(hash ImageHash) { + b.hashMutex.Lock() + defer b.hashMutex.Unlock() for _, ih := range hash.Hashes { - var ( - hashType = int(ih.Kind) - 1 - ) - b.hashMutex.RLock() - ids, ok := b.ids[hash.ID] - b.hashMutex.RUnlock() - if !ok { - b.hashMutex.Lock() - ids = &[]ID{hash.ID} - b.ids[hash.ID] = ids - b.hashMutex.Unlock() - } - - b.insertHash(hashType, ih.Hash, ids) + b.insertHash(ih, hash.ID) } } -// DecodeHashes should already have a lock +// DecodeHashes must already have a lock func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error { - for hashType, sourceHashes := range hashes.Hashes { - b.hashes[hashType] = make([]structHash, len(sourceHashes)) - for savedHash, idlistLocation := range sourceHashes { - b.hashes[hashType] = append(b.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]}) - for _, id := range hashes.IDs[idlistLocation] { - b.ids[id] = &hashes.IDs[idlistLocation] - } + b.ids = make(map[ID]*[]ID, len(hashes.Hashes)) + + // Initialize all the known equal IDs + for _, ids := range hashes.IDs { + for _, id := range ids { + b.ids[id] = &ids } } - for hashType := range b.hashes { - slices.SortFunc(b.hashes[hashType], func(a, b structHash) int { - return cmp.Compare(a.hash, b.hash) - }) + + slices.SortFunc(hashes.Hashes, func(existing, target SavedHash) int { + return cmp.Or( + cmp.Compare(existing.Hash.Kind, target.Hash.Kind), + cmp.Compare(existing.Hash.Hash, target.Hash.Hash), + cmp.Compare(existing.ID.Domain, target.ID.Domain), + cmp.Compare(existing.ID.ID, target.ID.ID), + ) + }) + + // Assume they are probably fairly equally split between hash types + b.aHashes = make([]SavedHash, 0, len(hashes.Hashes)/3) + b.dHashes = make([]SavedHash, 0, len(hashes.Hashes)/3) + b.pHashes = make([]SavedHash, 0, len(hashes.Hashes)/3) + for _, savedHash := range hashes.Hashes { + + if savedHash.Hash.Kind == goimagehash.AHash { + b.aHashes = append(b.aHashes, savedHash) + } + if savedHash.Hash.Kind == goimagehash.DHash { + b.dHashes = append(b.dHashes, savedHash) + } + if savedHash.Hash.Kind == goimagehash.PHash { + b.pHashes = append(b.pHashes, savedHash) + } + + if savedHash.ID == (ID{}) { + fmt.Println("Empty ID detected") + panic(savedHash) + } + // All known equal IDs are already mapped we can add any missing ones from hashes + if _, ok := b.ids[savedHash.ID]; !ok { + b.ids[savedHash.ID] = &[]ID{savedHash.ID} + } } + + hashCmp := func(existing, target SavedHash) int { + return cmp.Or( + cmp.Compare(existing.Hash.Hash, target.Hash.Hash), + cmp.Compare(existing.ID.Domain, target.ID.Domain), + cmp.Compare(existing.ID.ID, target.ID.ID), + ) + } + slices.SortFunc(b.aHashes, hashCmp) + slices.SortFunc(b.dHashes, hashCmp) + slices.SortFunc(b.pHashes, hashCmp) + return nil } // EncodeHashes should already have a lock func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) { - hashes := SavedHashes{ - Hashes: [3]map[uint64]int{ - make(map[uint64]int), - make(map[uint64]int), - make(map[uint64]int), - }, + savedHashes := SavedHashes{ + Hashes: make([]SavedHash, 0, len(b.aHashes)+len(b.dHashes)+len(b.pHashes)), } - idmap := map[*[]ID]int{} + // Only keep groups >1 as they will be mapped in SavedHashes.Hashes for _, ids := range b.ids { - if _, ok := idmap[ids]; ok { - continue + if len(*ids) > 1 { + savedHashes.IDs = append(savedHashes.IDs, *ids) } - idmap[ids] = len(hashes.IDs) - hashes.IDs = append(hashes.IDs, *ids) } - for hashType, hashToID := range b.hashes { - for _, hash := range hashToID { - hashes.Hashes[hashType][hash.hash] = idmap[hash.ids] - } - } - return hashes, nil + savedHashes.Hashes = append(savedHashes.Hashes, b.aHashes...) + savedHashes.Hashes = append(savedHashes.Hashes, b.dHashes...) + savedHashes.Hashes = append(savedHashes.Hashes, b.pHashes...) + + return savedHashes, nil } func (b *basicMapStorage) AssociateIDs(newids []NewIDs) error { @@ -171,7 +231,7 @@ func (b *basicMapStorage) AssociateIDs(newids []NewIDs) error { ids, found := b.ids[newid.OldID] b.hashMutex.RUnlock() if !found { - msg := "No IDs belonging to " + string(newid.OldID.Domain) + " exist on this server" + msg := "ID not found on this server" return errors.New(msg) } b.hashMutex.Lock() @@ -195,7 +255,9 @@ func NewBasicMapStorage() (HashStorage, error) { storage := &basicMapStorage{ hashMutex: &sync.RWMutex{}, ids: make(map[ID]*[]ID), - hashes: [3][]structHash{}, + aHashes: []SavedHash{}, + dHashes: []SavedHash{}, + pHashes: []SavedHash{}, } return storage, nil } diff --git a/CHDB.go b/CHDB.go index 5b39075..6af4a27 100644 --- a/CHDB.go +++ b/CHDB.go @@ -1,108 +1,11 @@ package ch -import ( - "database/sql" - "fmt" - "log" - "os" - "path/filepath" - - _ "modernc.org/sqlite" -) - -type CHDB struct { - comicvinePath string - sql *sql.DB - deleteExisting bool -} - -func OpenCHDB(path string, comicvinePath string, deleteExisting bool) (CHDB, error) { - path, _ = filepath.Abs(path) - err := os.MkdirAll(filepath.Dir(path), 0o755) - if err != nil { - panic("Unable to create directory " + filepath.Dir(path)) - } - println(fmt.Sprintf("file://%s?&_pragma=busy_timeout(500)&_pragma=journal_mode(wal)", path)) - sql, err := sql.Open("sqlite", fmt.Sprintf("file://%s?&_pragma=busy_timeout(500)&_pragma=journal_mode(wal)", path)) - if err != nil { - return CHDB{comicvinePath, sql, deleteExisting}, fmt.Errorf("Failed to open database: %w", err) - } - err = sql.Ping() - if err != nil { - return CHDB{comicvinePath, sql, deleteExisting}, fmt.Errorf("Failed to open database: %w", err) - } - _, err = sql.Exec(` -CREATE TABLE IF NOT EXISTS paths( - path STRING PRIMARY KEY -); -CREATE TABLE IF NOT EXISTS bad_urls( - url STRING PRIMARY KEY -); -`) - if err != nil { - err = fmt.Errorf("Failed to create table: %w", err) - } - return CHDB{comicvinePath, sql, deleteExisting}, err -} - -func (s CHDB) PathHashed(path string) bool { - path, _ = filepath.Rel(s.comicvinePath, path) - dbPath := "" - - if s.deleteExisting { - _ = s.sql.QueryRow("SELECT path FROM paths where path=?", path).Scan(&dbPath) - - if dbPath == path { - os.Remove(filepath.Join(s.comicvinePath, path)) - } - return dbPath == path - } - count := 0 - _ = s.sql.QueryRow("SELECT count(path) FROM paths where path=?", path).Scan(&count) - return count > 0 -} - -func (s CHDB) PathDownloaded(path string) bool { - relPath, _ := filepath.Rel(s.comicvinePath, path) - - count := 0 - _ = s.sql.QueryRow("SELECT count(path) FROM paths where path=?", relPath).Scan(&count) - if count != 1 { - f, err := os.Open(path) - if err == nil { - defer f.Close() - } - return !os.IsNotExist(err) - } - return true -} - -func (s CHDB) AddPath(path string) { - relPath, _ := filepath.Rel(s.comicvinePath, path) - _, err := s.sql.Exec("INSERT INTO paths VALUES(?) ON CONFLICT DO NOTHING", relPath) - if err != nil { - log.Println(fmt.Errorf("Failed to insert %v into paths: %w", relPath, err)) - } - - if s.deleteExisting { - _ = os.Remove(path) - _ = RmdirP(filepath.Dir(path)) - } -} - -func (s CHDB) CheckURL(url string) bool { - count := 0 - _ = s.sql.QueryRow("SELECT count(url) FROM bad_urls where url=?", url).Scan(&count) - return count > 0 -} - -func (s CHDB) AddURL(url string) { - _, err := s.sql.Exec("INSERT INTO bad_urls VALUES(?) ON CONFLICT DO NOTHING", url) - if err != nil { - log.Println(fmt.Errorf("Failed to insert %v into bad_urls: %w", url, err)) - } -} - -func (s CHDB) Close() error { - return s.sql.Close() +type CHDB interface { + // OpenCHDB(path string, comicvinePath string, deleteExisting bool) (CHDB, error) + PathHashed(path string) bool + PathDownloaded(path string) bool + AddPath(path string) + CheckURL(url string) bool + AddURL(url string) + Close() error } diff --git a/CHDB_bolt.go b/CHDB_bolt.go new file mode 100644 index 0000000..770e0f0 --- /dev/null +++ b/CHDB_bolt.go @@ -0,0 +1,177 @@ +package ch + +import ( + "fmt" + "log" + "os" + "path/filepath" + "slices" + + bolt "go.etcd.io/bbolt" +) + +type CHDBBolt struct { + comicvinePath string + db *bolt.DB + deleteExisting bool +} + +func OpenCHDBBolt(path string, comicvinePath string, deleteExisting bool) (CHDBBolt, error) { + path, _ = filepath.Abs(path) + err := os.MkdirAll(filepath.Dir(path), 0o755) + if err != nil { + panic("Unable to create directory " + filepath.Dir(path)) + } + db, err := bolt.Open(path, 0o644, nil) + if err != nil { + return CHDBBolt{comicvinePath, db, deleteExisting}, fmt.Errorf("failed to open database: %w", err) + } + err = db.Update(func(tx *bolt.Tx) error { + + _, err = tx.CreateBucketIfNotExists([]byte("paths")) + if err != nil { + return fmt.Errorf("failed to create bucket %v: %w", "paths", err) + } + _, err = tx.CreateBucketIfNotExists([]byte("bad_urls")) + if err != nil { + return fmt.Errorf("failed to create bucket %v: %w", "paths", err) + } + return nil + }) + if err != nil { + db.Close() + return CHDBBolt{comicvinePath, db, deleteExisting}, fmt.Errorf("failed to init database: %w", err) + } + + return CHDBBolt{comicvinePath, db, deleteExisting}, nil +} + +func (c CHDBBolt) Import(paths []string, bad_urls []string) { + slices.Sort(paths) + slices.Sort(bad_urls) + c.db.Update(func(tx *bolt.Tx) error { + p := tx.Bucket([]byte("paths")) + b := tx.Bucket([]byte("bad_urls")) + + for _, path := range paths { + p.Put([]byte(path), []byte{}) + } + for _, url := range bad_urls { + b.Put([]byte(url), []byte{}) + } + return nil + }) +} + +func (c CHDBBolt) Dump() (paths []string, bad_urls []string) { + + c.db.View(func(tx *bolt.Tx) error { + p := tx.Bucket([]byte("paths")) + b := tx.Bucket([]byte("bad_urls")) + paths = make([]string, 0, p.Inspect().KeyN) + bad_urls = make([]string, 0, b.Inspect().KeyN) + b.ForEach(func(k, v []byte) error { + bad_urls = append(bad_urls, string(k)+"") + return nil + }) + p.ForEach(func(k, v []byte) error { + paths = append(paths, string(k)+"") + return nil + }) + return nil + }) + return paths, bad_urls +} + +func (c CHDBBolt) PathHashed(path string) bool { + path, _ = filepath.Rel(c.comicvinePath, path) + + tx, err := c.db.Begin(false) + if err != nil { + return false + } + defer tx.Rollback() + b := tx.Bucket([]byte("paths")) + dbRes := b.Get([]byte(path)) + if dbRes != nil { + if c.deleteExisting { + os.Remove(filepath.Join(c.comicvinePath, path)) + } + return true + } + + return false +} + +func (c CHDBBolt) PathDownloaded(path string) bool { + relPath, _ := filepath.Rel(c.comicvinePath, path) + + tx, err := c.db.Begin(false) + if err != nil { + return false + } + defer tx.Rollback() + b := tx.Bucket([]byte("paths")) + dbRes := b.Get([]byte(relPath)) + if dbRes == nil { + + f, err := os.Open(path) + if err == nil { + defer f.Close() + } + return !os.IsNotExist(err) + } + return true +} + +func (c CHDBBolt) AddPath(path string) { + relPath, _ := filepath.Rel(c.comicvinePath, path) + + tx, err := c.db.Begin(true) + if err != nil { + c.db.Logger().Errorf("Failed to open transaction: %v", err) + } + defer tx.Rollback() + b := tx.Bucket([]byte("paths")) + + err = b.Put([]byte(relPath), []byte{}) + if err != nil { + log.Println(fmt.Errorf("Failed to insert %v (%v) into paths: %w", path, relPath, err)) + } + tx.Commit() + if c.deleteExisting { + _ = os.Remove(path) + _ = RmdirP(filepath.Dir(path)) + } +} + +func (c CHDBBolt) CheckURL(url string) bool { + + tx, err := c.db.Begin(true) + if err != nil { + return false + } + defer tx.Rollback() + b := tx.Bucket([]byte("bad_urls")) + return b.Get([]byte(url)) != nil +} + +func (c CHDBBolt) AddURL(url string) { + + tx, err := c.db.Begin(true) + if err != nil { + c.db.Logger().Errorf("Failed to open transaction: %v", err) + } + defer tx.Rollback() + b := tx.Bucket([]byte("bad_urls")) + + err = b.Put([]byte(url), []byte{}) + if err != nil { + log.Println(fmt.Errorf("Failed to insert %v into bad_urls: %w", url, err)) + } + tx.Commit() +} + +func (c CHDBBolt) Close() error { + return c.db.Close() +} diff --git a/CHDB_sqlite.go b/CHDB_sqlite.go new file mode 100644 index 0000000..c0c70ef --- /dev/null +++ b/CHDB_sqlite.go @@ -0,0 +1,142 @@ +package ch + +import ( + "database/sql" + "fmt" + "log" + "os" + "path/filepath" + + _ "modernc.org/sqlite" +) + +type CHDBSqlite struct { + comicvinePath string + sql *sql.DB + deleteExisting bool +} + +func OpenCHDBSqlite(path string, comicvinePath string, deleteExisting bool) (CHDBSqlite, error) { + path, _ = filepath.Abs(path) + err := os.MkdirAll(filepath.Dir(path), 0o755) + if err != nil { + panic("Unable to create directory " + filepath.Dir(path)) + } + println(fmt.Sprintf("file://%s?&_pragma=busy_timeout(500)&_pragma=journal_mode(wal)", path)) + sql, err := sql.Open("sqlite", fmt.Sprintf("file://%s?&_pragma=busy_timeout(500)&_pragma=journal_mode(wal)", path)) + if err != nil { + return CHDBSqlite{comicvinePath, sql, deleteExisting}, fmt.Errorf("Failed to open database: %w", err) + } + err = sql.Ping() + if err != nil { + return CHDBSqlite{comicvinePath, sql, deleteExisting}, fmt.Errorf("Failed to open database: %w", err) + } + _, err = sql.Exec(` +CREATE TABLE IF NOT EXISTS paths( + path STRING PRIMARY KEY +); +CREATE TABLE IF NOT EXISTS bad_urls( + url STRING PRIMARY KEY +); +`) + if err != nil { + err = fmt.Errorf("Failed to create table: %w", err) + } + return CHDBSqlite{comicvinePath, sql, deleteExisting}, err +} + +func (s CHDBSqlite) Dump() (paths []string, bad_urls []string) { + + rows, err := s.sql.Query("SELECT path from paths") + if err != nil { + panic(err) + } + + for rows.Next() { + var value string + err = rows.Scan(&value) + if err != nil { + panic(err) + } + paths = append(paths, value) + } + rows.Close() + + rows, err = s.sql.Query("SELECT url from bad_urls") + if err != nil { + panic(err) + } + + for rows.Next() { + var value string + err = rows.Scan(&value) + if err != nil { + panic(err) + } + bad_urls = append(bad_urls, value) + } + rows.Close() + return paths, bad_urls +} + +func (s CHDBSqlite) PathHashed(path string) bool { + path, _ = filepath.Rel(s.comicvinePath, path) + dbPath := "" + + if s.deleteExisting { + _ = s.sql.QueryRow("SELECT path FROM paths where path=?", path).Scan(&dbPath) + + if dbPath == path { + os.Remove(filepath.Join(s.comicvinePath, path)) + } + return dbPath == path + } + count := 0 + _ = s.sql.QueryRow("SELECT count(path) FROM paths where path=?", path).Scan(&count) + return count > 0 +} + +func (s CHDBSqlite) PathDownloaded(path string) bool { + relPath, _ := filepath.Rel(s.comicvinePath, path) + + count := 0 + _ = s.sql.QueryRow("SELECT count(path) FROM paths where path=?", relPath).Scan(&count) + if count != 1 { + f, err := os.Open(path) + if err == nil { + defer f.Close() + } + return !os.IsNotExist(err) + } + return true +} + +func (s CHDBSqlite) AddPath(path string) { + relPath, _ := filepath.Rel(s.comicvinePath, path) + _, err := s.sql.Exec("INSERT INTO paths VALUES(?) ON CONFLICT DO NOTHING", relPath) + if err != nil { + log.Println(fmt.Errorf("Failed to insert %v into paths: %w", relPath, err)) + } + + if s.deleteExisting { + _ = os.Remove(path) + _ = RmdirP(filepath.Dir(path)) + } +} + +func (s CHDBSqlite) CheckURL(url string) bool { + count := 0 + _ = s.sql.QueryRow("SELECT count(url) FROM bad_urls where url=?", url).Scan(&count) + return count > 0 +} + +func (s CHDBSqlite) AddURL(url string) { + _, err := s.sql.Exec("INSERT INTO bad_urls VALUES(?) ON CONFLICT DO NOTHING", url) + if err != nil { + log.Println(fmt.Errorf("Failed to insert %v into bad_urls: %w", url, err)) + } +} + +func (s CHDBSqlite) Close() error { + return s.sql.Close() +} diff --git a/cmd/bolt-migrate/main.go b/cmd/bolt-migrate/main.go new file mode 100644 index 0000000..9e97144 --- /dev/null +++ b/cmd/bolt-migrate/main.go @@ -0,0 +1,31 @@ +package main + +import ( + "fmt" + "os" + + ch "gitea.narnian.us/lordwelch/comic-hasher" +) + +func main() { + fmt.Printf("cv path: %s Sqlite path: %s Bolt path: %s\n", os.Args[1], os.Args[2], os.Args[3]) + sql, err := ch.OpenCHDBSqlite(os.Args[2], os.Args[1], false) + if err != nil { + panic(err) + } + db, err := ch.OpenCHDBBolt(os.Args[3], os.Args[1], false) + if err != nil { + panic(err) + } + paths, bad_urls := sql.Dump() + fmt.Printf("Dumped %d %d", len(paths), len(bad_urls)) + db.Import(paths, bad_urls) + // for _, path := range paths { + // db.AddPath(filepath.Join(os.Args[1], path)) + // } + // for _, url := range bad_urls { + // db.AddURL(url) + // } + sql.Close() + db.Close() +} diff --git a/cmd/comic-hasher/main.go b/cmd/comic-hasher/main.go index e3583fb..074a6d2 100644 --- a/cmd/comic-hasher/main.go +++ b/cmd/comic-hasher/main.go @@ -29,10 +29,9 @@ import ( "sync" "time" + "github.com/disintegration/imaging" "github.com/kr/pretty" - "github.com/vmihailenco/msgpack/v5" - _ "golang.org/x/image/tiff" _ "golang.org/x/image/vp8" _ "golang.org/x/image/vp8l" @@ -57,23 +56,6 @@ type Server struct { onlyHashNewIDs bool } -type Format int - -const ( - Msgpack = iota + 1 - JSON -) - -var formatNames = map[Format]string{ - JSON: "json", - Msgpack: "msgpack", -} - -var formatValues = map[string]Format{ - "json": JSON, - "msgpack": Msgpack, -} - var bufPool = &sync.Pool{ New: func() any { // The Pool's New function should generally only return pointer @@ -83,22 +65,6 @@ var bufPool = &sync.Pool{ }, } -func (f Format) String() string { - if name, known := formatNames[f]; known { - return name - } - return "Unknown" -} - -func (f *Format) Set(s string) error { - if format, known := formatValues[strings.ToLower(s)]; known { - *f = format - } else { - return fmt.Errorf("Unknown format: %d", f) - } - return nil -} - type Storage int const ( @@ -141,8 +107,6 @@ func (f *Storage) Set(s string) error { return nil } -type Encoder func(any) ([]byte, error) -type Decoder func([]byte, interface{}) error type CVOpts struct { downloadCovers bool APIKey string @@ -158,7 +122,7 @@ type Opts struct { sqlitePath string loadEmbeddedHashes bool saveEmbeddedHashes bool - format Format + format ch.Format hashesPath string storageType Storage onlyHashNewIDs bool @@ -169,7 +133,7 @@ type Opts struct { } func main() { - opts := Opts{format: Msgpack, storageType: BasicMap} // flag is weird + opts := Opts{format: ch.Msgpack, storageType: BasicMap} // flag is weird wd, err := os.Getwd() fmt.Println(err) if err != nil { @@ -208,13 +172,11 @@ func main() { panic(err) } } - // opts.onlyHashNewIDs = opts.onlyHashNewIDs || opts.deleteHashedImages if opts.cv.downloadCovers { if opts.cv.APIKey == "" { log.Fatal("No ComicVine API Key provided") } } - opts.cv.thumbOnly = opts.cv.thumbOnly || (opts.onlyHashNewIDs && (opts.deleteHashedImages || !opts.cv.keepDownloaded)) opts.path, _ = filepath.Abs(opts.path) if opts.hashesPath == "" { opts.hashesPath = filepath.Join(opts.path, "hashes.gz") @@ -230,9 +192,7 @@ func main() { opts.cv.path, _ = filepath.Abs(opts.cv.path) pretty.Log(opts) - if !opts.cv.keepDownloaded && opts.onlyHashNewIDs { - panic("You need to fix your -cv-keep-downloaded and -only-hash-new-ids flags") - } + // TODO: Fix options startServer(opts) } @@ -553,9 +513,7 @@ func (s *Server) hasher(workerID int, done func(int)) { } select { - case <-s.Context.Done(): - log.Println("Recieved quit") - return + // TODO: Check channel pipelines case s.mappingQueue <- hash: default: } @@ -589,59 +547,12 @@ func (s *Server) reader(workerID int, done func(i int)) { NewOnly: s.onlyHashNewIDs, } select { - case <-s.Context.Done(): - log.Println("Recieved quit") - return case s.hashingQueue <- im: default: } } } -// EncodeHashes must have a lock to s.hashMutex -func (s *Server) EncodeHashes(format Format) ([]byte, error) { - var encoder Encoder - switch format { - case Msgpack: - encoder = msgpack.Marshal - case JSON: - encoder = json.Marshal - default: - return nil, fmt.Errorf("Unknown format: %v", format) - } - hashes, err := s.hashes.EncodeHashes() - if err != nil { - return nil, err - } - return encoder(hashes) -} - -// DecodeHashes must have a lock to s.hashMutex -func (s *Server) DecodeHashes(format Format, hashes []byte) error { - var decoder Decoder - switch format { - case Msgpack: - decoder = msgpack.Unmarshal - case JSON: - decoder = json.Unmarshal - - default: - return fmt.Errorf("Unknown format: %v", format) - } - loadedHashes := ch.SavedHashes{} - err := decoder(hashes, &loadedHashes) - if err != nil || len(loadedHashes.Hashes[0]) == 0 { - fmt.Println("Failed to load hashes, checking if they are old hashes", format, ":", err) - oldHashes := make(ch.OldSavedHashes) - if err = decoder(hashes, &oldHashes); err != nil { - return err - } - loadedHashes = ch.ConvertSavedHashes(oldHashes) - } - - return s.hashes.DecodeHashes(loadedHashes) -} - func (s *Server) HashLocalImages(opts Opts) { if opts.coverPath == "" { return @@ -700,28 +611,17 @@ func initializeStorage(opts Opts) (ch.HashStorage, error) { return nil, errors.New("Unknown storage type provided") } -func loadHashes(opts Opts, decodeHashes func(format Format, hashes []byte) error) { +func loadHashes(opts Opts) *ch.SavedHashes { + var hashes []byte if opts.loadEmbeddedHashes && len(ch.Hashes) != 0 { fmt.Println("Loading embedded hashes") - var err error - hashes := ch.Hashes + hashes = ch.Hashes if gr, err := gzip.NewReader(bytes.NewReader(ch.Hashes)); err == nil { hashes, err = io.ReadAll(gr) if err != nil { panic(fmt.Sprintf("Failed to read embedded hashes: %s", err)) } } - - var format Format - for _, format = range []Format{Msgpack, JSON} { - if err = decodeHashes(format, hashes); err == nil { - break - } - } - if err != nil { - panic(fmt.Sprintf("Failed to decode embedded hashes: %s", err)) - } - fmt.Printf("Loaded embedded %s hashes\n", format) } else { fmt.Println("Loading saved hashes") if f, err := os.Open(opts.hashesPath); err == nil { @@ -731,64 +631,67 @@ func loadHashes(opts Opts, decodeHashes func(format Format, hashes []byte) error } else { _, _ = f.Seek(0, io.SeekStart) } - hashes, err := io.ReadAll(buf) + hashes, err = io.ReadAll(buf) f.Close() if err != nil { panic(fmt.Sprintf("Failed to load hashes from disk: %s", err)) } - - var format Format - for _, format = range []Format{Msgpack, JSON} { - if err = decodeHashes(format, hashes); err == nil { - break - } - } - - if err != nil { - panic(fmt.Sprintf("Failed to decode hashes from disk: %s", err)) - } - fmt.Printf("Loaded %s hashes from %q\n", format, opts.hashesPath) } else { if errors.Is(err, os.ErrNotExist) { log.Println("No saved hashes to load") } else { log.Println("Unable to load saved hashes", err) } + return nil } } + + var ( + format ch.Format + loadedHashes *ch.SavedHashes + err error + ) + for _, format = range []ch.Format{ch.Msgpack, ch.JSON} { + if loadedHashes, err = ch.DecodeHashes(format, hashes); errors.Is(err, ch.DecodeError) { + continue + } + break + } + if err != nil { + panic(fmt.Sprintf("Failed to decode hashes: %s", err)) + } + fmt.Printf("Loaded %s hashes\n", format) + return loadedHashes } -func saveHashes(opts Opts, encodeHashes func(format Format) ([]byte, error)) { - if !opts.loadEmbeddedHashes || opts.saveEmbeddedHashes { - encodedHashes, err := encodeHashes(opts.format) - if err == nil { - if f, err := os.Create(opts.hashesPath); err == nil { - failed := false - gzw := gzip.NewWriter(f) - _, err := gzw.Write(encodedHashes) - if err != nil { - log.Println("Failed to write hashes", err) - failed = true - } - err = gzw.Close() - if err != nil { - log.Println("Failed to write hashes", err) - failed = true - } - err = f.Close() - if err != nil { - log.Println("Failed to write hashes", err) - failed = true - } - if !failed { - log.Println("Successfully saved hashes") - } - } else { - log.Println("Unabled to save hashes", err) - } - } else { - fmt.Printf("Unable to encode hashes as %v: %v", opts.format, err) - } +func saveHashes(opts Opts, hashes ch.SavedHashes) error { + if opts.loadEmbeddedHashes && !opts.saveEmbeddedHashes { + return errors.New("refusing to save embedded hashes") } + + encodedHashes, err := ch.EncodeHashes(hashes, opts.format) + if err != nil { + return fmt.Errorf("unable to encode hashes as %v: %w", opts.format, err) + } + f, err := os.Create(opts.hashesPath) + if err != nil { + return fmt.Errorf("unabled to save hashes: %w", err) + } + + gzw := gzip.NewWriter(f) + + if _, err = gzw.Write(encodedHashes); err != nil { + return fmt.Errorf("failed to write hashes: %w", err) + } + + if err = gzw.Close(); err != nil { + return fmt.Errorf("failed to write hashes: %w", err) + } + + if err = f.Close(); err != nil { + return fmt.Errorf("failed to write hashes: %w", err) + } + log.Println("Successfully saved hashes") + return nil } func downloadProcessor(chdb ch.CHDB, opts Opts, imagePaths chan cv.Download, server Server) { @@ -803,7 +706,6 @@ func downloadProcessor(chdb ch.CHDB, opts Opts, imagePaths chan cv.Download, ser } if chdb.PathHashed(path.Dest) { - // log.Println(path.Dest, "File has already been hashed, it may not be saved in the hashes file because we currently don't save any hashes if we've crashed") continue } var ( @@ -832,7 +734,7 @@ func downloadProcessor(chdb ch.CHDB, opts Opts, imagePaths chan cv.Download, ser } continue // skip this image } - chdb.AddPath(path.Dest) // Add to sqlite db and remove file if opts.deleteHashedImages is true + chdb.AddPath(path.Dest) // Add to db and remove file if opts.deleteHashedImages is true im := ch.Im{ Im: i, @@ -845,6 +747,7 @@ func downloadProcessor(chdb ch.CHDB, opts Opts, imagePaths chan cv.Download, ser } func startServer(opts Opts) { + imaging.SetMaxProcs(2) if opts.cpuprofile != "" { f, err := os.Create(opts.cpuprofile) if err != nil { @@ -904,31 +807,37 @@ func startServer(opts Opts) { mwg.Add(1) go server.mapper(func() { log.Println("Mapper 0 completed"); mwg.Done() }) - // server.DecodeHashes would normally need a write lock + // DecodeHashes would normally need a write lock // nothing else has been started yet so we don't need one - loadHashes(opts, server.DecodeHashes) + if err := server.hashes.DecodeHashes(*loadHashes(opts)); err != nil { + panic(err) + } server.HashLocalImages(opts) - chdb, err := ch.OpenCHDB(filepath.Join(opts.path, "ch.sqlite"), opts.cv.path, opts.deleteHashedImages) + chdb, err := ch.OpenCHDBBolt(filepath.Join(opts.path, "chdb.bolt"), opts.cv.path, opts.deleteHashedImages) if err != nil { panic(err) } log.Println("Init downloaders") dwg := sync.WaitGroup{} + dcwg := sync.WaitGroup{} finishedDownloadQueue := make(chan cv.Download, 1) - go downloadProcessor(chdb, opts, finishedDownloadQueue, server) + dcwg.Add(1) + go func() { + defer dcwg.Done() + downloadProcessor(chdb, opts, finishedDownloadQueue, server) + }() if opts.cv.downloadCovers { dwg.Add(1) imageTypes := []string{} if opts.cv.thumbOnly { imageTypes = append(imageTypes, "thumb_url") - } - if opts.cv.originalOnly { + } else if opts.cv.originalOnly { imageTypes = append(imageTypes, "original_url") } - cvdownloader := cv.NewCVDownloader(server.Context, bufPool, chdb, opts.cv.path, opts.cv.APIKey, imageTypes, opts.cv.keepDownloaded, opts.cv.hashDownloaded, finishedDownloadQueue) + cvdownloader := cv.NewCVDownloader(server.Context, bufPool, opts.onlyHashNewIDs, server.hashes.GetIDs, chdb, opts.cv.path, opts.cv.APIKey, imageTypes, opts.cv.keepDownloaded, opts.cv.hashDownloaded, finishedDownloadQueue) go func() { defer dwg.Done() cv.DownloadCovers(cvdownloader) @@ -954,7 +863,8 @@ func startServer(opts Opts) { close(server.readerQueue) log.Println("waiting on readers") rwg.Wait() - for range server.readerQueue { + for dw := range server.readerQueue { + fmt.Println("Skipping read", dw) } log.Println("waiting on downloaders") @@ -962,28 +872,39 @@ func startServer(opts Opts) { log.Println("waiting on downloader") close(finishedDownloadQueue) - for range finishedDownloadQueue { + dcwg.Wait() // Wait for the download processor to finish + for dw := range finishedDownloadQueue { + fmt.Println("Skipping download", dw.IssueID) } // close(server.hashingQueue) // Closed by downloadProcessor log.Println("waiting on hashers") hwg.Wait() - for range server.hashingQueue { + for dw := range server.hashingQueue { + fmt.Println("Skipping hashing", dw.ID) } close(server.mappingQueue) log.Println("waiting on mapper") mwg.Wait() - for range server.mappingQueue { + for dw := range server.mappingQueue { + fmt.Println("Skipping mapping", dw.ID) } close(server.signalQueue) - for range server.signalQueue { + for dw := range server.signalQueue { + fmt.Println("Skipping", dw) } _ = chdb.Close() // server.EncodeHashes would normally need a read lock // the server has been stopped so it's not needed here - saveHashes(opts, server.EncodeHashes) + hashes, err := server.hashes.EncodeHashes() + if err != nil { + panic(fmt.Errorf("Failed to save hashes: %w", err)) + } + if err = saveHashes(opts, hashes); err != nil { + panic(err) + } } diff --git a/cv/cv.go b/cv/cv.go index 8bc435f..7292b2f 100644 --- a/cv/cv.go +++ b/cv/cv.go @@ -73,14 +73,16 @@ type CVDownloader struct { Context context.Context FinishedDownloadQueue chan Download - fileList []string - totalResults int - imageWG sync.WaitGroup - downloadQueue chan *CVResult - imageDownloads chan download - notFound chan download - chdb ch.CHDB - bufPool *sync.Pool + fileList []string + totalResults int + imageWG sync.WaitGroup + downloadQueue chan *CVResult + imageDownloads chan download + notFound chan download + chdb ch.CHDB + bufPool *sync.Pool + get_id func(id ch.ID) ch.IDList + only_hash_new_ids bool } var ( @@ -128,8 +130,8 @@ func (c *CVDownloader) loadIssues(filename string) (*CVResult, error) { return tmp, nil } -func Get(ctx context.Context, url string) (*http.Response, error, func()) { - ctx, cancel := context.WithTimeout(ctx, time.Second*20) +func Get(url string) (*http.Response, error, func()) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*20) req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return nil, err, cancel @@ -144,7 +146,7 @@ func getOffset(name string) int { } // updateIssues c.downloadQueue must not be closed before this function has returned -func (c *CVDownloader) updateIssues() { +func (c *CVDownloader) updateIssues() int { base_url, err := url.Parse("https://comicvine.gamespot.com/api/issues/?sort=date_added,id:asc&format=json&field_list=id,image,volume") if err != nil { log.Fatal(err) @@ -183,7 +185,7 @@ func (c *CVDownloader) updateIssues() { for offset = 0; offset <= c.totalResults; offset += 100 { index := offset / 100 if c.hasQuit() { - return + return offset - 100 } if index < len(c.fileList) { if getOffset(c.fileList[index]) == offset { // If it's in order and it's not missing it should be here @@ -195,7 +197,7 @@ func (c *CVDownloader) updateIssues() { if c.totalResults == issue.Offset+issue.NumberOfPageResults { if index != len(c.fileList)-1 { log.Printf("Wrong index: expected %d got %d", len(c.fileList), index) - return + return offset - 100 } log.Println("Deleting the last page to detect new comics") os.Remove(filepath.Join(c.JSONPath, c.fileList[index])) @@ -231,7 +233,7 @@ func (c *CVDownloader) updateIssues() { if c.totalResults == issue.Offset+issue.NumberOfPageResults { if index != len(c.fileList)-1 { log.Printf("Wrong index: expected %d got %d", len(c.fileList), index) - return + return offset - 100 } log.Println("Deleting the last page to detect new comics") os.Remove(filepath.Join(c.JSONPath, c.fileList[index])) @@ -255,17 +257,17 @@ func (c *CVDownloader) updateIssues() { select { case <-c.Context.Done(): // allows us to return immediately even during a timeout - return + return offset - 100 case <-time.After(10 * time.Second): } - resp, err, cancelDownloadCTX := Get(c.Context, URI.String()) + resp, err, cancelDownloadCTX := Get(URI.String()) if err != nil { cancelDownloadCTX() if retry(URI.String(), err) { continue } // Fail and let comic-hasher try the whole thing again later - return + return offset - 100 } if resp.StatusCode != 200 { cancelDownloadCTX() @@ -277,7 +279,7 @@ func (c *CVDownloader) updateIssues() { select { case <-c.Context.Done(): // allows us to return immediately even during a timeout _ = resp.Body.Close() - return + return offset - 100 case <-time.After(1 * time.Hour): } } @@ -295,7 +297,7 @@ func (c *CVDownloader) updateIssues() { if retry(URI.String(), err) { continue } - return + return offset - 100 } cancelDownloadCTX() if issue.NumberOfTotalResults > c.totalResults { @@ -303,15 +305,13 @@ func (c *CVDownloader) updateIssues() { } prev = -1 failCount = 0 - // When canceled one of these will randomly be chosen, c.downloadQueue won't be closed until after this function returns select { - case <-c.Context.Done(): - return case c.downloadQueue <- issue: } c.fileList = ch.Insert(c.fileList, fmt.Sprintf("cv-%v.json", offset)) log.Printf("Downloaded %s/cv-%v.json", c.JSONPath, offset) } + return offset } type download struct { @@ -328,16 +328,9 @@ func (c *CVDownloader) start_downloader() { go func() { log.Println("starting downloader", i) for dl := range c.imageDownloads { - if c.hasQuit() { - c.imageWG.Done() - continue // We must continue so that c.imageWG will complete otherwise it will hang forever - } if dl.finished { select { - case <-c.Context.Done(): - c.imageWG.Done() - continue case c.FinishedDownloadQueue <- Download{ URL: dl.url, Dest: dl.dest, @@ -348,7 +341,7 @@ func (c *CVDownloader) start_downloader() { continue } dir := filepath.Dir(dl.dest) - resp, err, cancelDownload := Get(c.Context, dl.url) + resp, err, cancelDownload := Get(dl.url) if err != nil { cancelDownload() log.Println("Failed to download", dl.volumeID, "/", dl.issueID, dl.url, err) @@ -449,9 +442,16 @@ func (c *CVDownloader) downloadImages() { } imageURLs := []i{{issue.Image.IconURL, "icon_url"}, {issue.Image.MediumURL, "medium_url"}, {issue.Image.ScreenURL, "screen_url"}, {issue.Image.ScreenLargeURL, "screen_large_url"}, {issue.Image.SmallURL, "small_url"}, {issue.Image.SuperURL, "super_url"}, {issue.Image.ThumbURL, "thumb_url"}, {issue.Image.TinyURL, "tiny_url"}, {issue.Image.OriginalURL, "original_url"}} for _, image := range imageURLs { - if c.hasQuit() { - return + if strings.HasSuffix(image.url, "6373148-blank.png") { + c.notFound <- download{ + url: image.url, + offset: list.Offset, + volumeID: issue.Volume.ID, + issueID: issue.ID, + } + continue } + if len(c.ImageTypes) > 0 && !slices.Contains(c.ImageTypes, image.name) { continue } @@ -469,6 +469,7 @@ func (c *CVDownloader) downloadImages() { issueID: issue.ID, finished: true, } + continue } ext := strings.TrimSuffix(strings.ToLower(path.Ext(uri.Path)), "~original") if ext == "" || (len(ext) > 4 && !slices.Contains([]string{".avif", ".webp", ".tiff", ".heif"}, ext)) { @@ -477,7 +478,11 @@ func (c *CVDownloader) downloadImages() { dir := filepath.Join(c.ImagePath, strconv.Itoa(issue.Volume.ID), strconv.Itoa(issue.ID)) path := filepath.Join(dir, image.name+ext) - if c.chdb.PathDownloaded(path) { + ids := c.get_id(ch.ID{ + Domain: ch.ComicVine, + ID: strconv.Itoa(issue.ID), + }) + if c.chdb.PathDownloaded(path) || c.only_hash_new_ids && len(ids) > 0 { if _, err = os.Stat(path); c.SendExistingImages && err == nil { // We don't add to the count of added as these should be processed immediately log.Printf("Sending Existing image %v/%v %v", issue.Volume.ID, issue.ID, path) @@ -516,8 +521,6 @@ func (c *CVDownloader) downloadImages() { t := 10 * time.Second log.Println("Waiting for", t, "at offset", list.Offset, "had to wait for", waited) select { - case <-c.Context.Done(): // allows us to return immediately even during a timeout - return case <-time.After(t): } } else { @@ -543,9 +546,6 @@ list: } for _, issue := range list.Results { for _, url := range []string{issue.Image.IconURL, issue.Image.MediumURL, issue.Image.ScreenURL, issue.Image.ScreenLargeURL, issue.Image.SmallURL, issue.Image.SuperURL, issue.Image.ThumbURL, issue.Image.TinyURL, issue.Image.OriginalURL} { - if c.hasQuit() { - return ErrQuit - } if c.chdb.CheckURL(url) { indexesToRemove = append(indexesToRemove, i) if err := os.Remove(filepath.Join(c.JSONPath, jsonFile)); err != nil { @@ -590,7 +590,7 @@ func (c *CVDownloader) cleanDirs() { }) } -func NewCVDownloader(ctx context.Context, bufPool *sync.Pool, chdb ch.CHDB, workPath, APIKey string, imageTypes []string, keepDownloadedImages, sendExistingImages bool, finishedDownloadQueue chan Download) *CVDownloader { +func NewCVDownloader(ctx context.Context, bufPool *sync.Pool, only_hash_new_ids bool, get_id func(id ch.ID) ch.IDList, chdb ch.CHDB, workPath, APIKey string, imageTypes []string, keepDownloadedImages, sendExistingImages bool, finishedDownloadQueue chan Download) *CVDownloader { return &CVDownloader{ Context: ctx, JSONPath: filepath.Join(workPath, "_json"), @@ -602,6 +602,8 @@ func NewCVDownloader(ctx context.Context, bufPool *sync.Pool, chdb ch.CHDB, work KeepDownloadedImages: keepDownloadedImages, ImageTypes: imageTypes, chdb: chdb, + get_id: get_id, + only_hash_new_ids: only_hash_new_ids, } } @@ -609,9 +611,9 @@ func DownloadCovers(c *CVDownloader) { var ( err error ) - c.downloadQueue = make(chan *CVResult, 100) // This is just json it shouldn't take up much more than 122 MB - c.imageDownloads = make(chan download, 1) // These are just URLs should only take a few MB - c.notFound = make(chan download, 1) // Same here + c.downloadQueue = make(chan *CVResult) // This is just json it shouldn't take up much more than 122 MB + c.imageDownloads = make(chan download, 1) // These are just URLs should only take a few MB + c.notFound = make(chan download, 1) // Same here os.MkdirAll(c.JSONPath, 0o777) f, _ := os.Create(filepath.Join(c.ImagePath, ".keep")) f.Close() @@ -643,7 +645,7 @@ func DownloadCovers(c *CVDownloader) { dwg.Done() }() - c.updateIssues() + offset := c.updateIssues() issueCount := len(c.fileList) * 100 log.Println("Number of issues", issueCount, " expected:", c.totalResults) @@ -654,15 +656,19 @@ func DownloadCovers(c *CVDownloader) { log.Println("Waiting for downloaders") dwg.Wait() close(c.imageDownloads) - for range c.imageDownloads { + for dw := range c.imageDownloads { + fmt.Println("Skipping cv download", dw.issueID) } close(c.notFound) - for range c.notFound { + for dw := range c.notFound { + fmt.Println("Skipping not found", dw.issueID) } // We drain this at the end because we need to wait for the images to download - for range c.downloadQueue { + for dw := range c.downloadQueue { + fmt.Println("Skipping page download", dw.Offset) } log.Println("Completed downloading images") + log.Println("Last offset", offset) } diff --git a/go.mod b/go.mod index ab38c56..32a6f60 100644 --- a/go.mod +++ b/go.mod @@ -1,25 +1,23 @@ module gitea.narnian.us/lordwelch/comic-hasher -go 1.22.0 +go 1.23.0 -toolchain go1.22.5 +toolchain go1.24.0 require ( - gitea.narnian.us/lordwelch/goimagehash v0.0.0-20250113012632-72c18ebad3c6 + gitea.narnian.us/lordwelch/goimagehash v0.0.0-20250130004139-e91c39c79e0d + github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09 github.com/fmartingr/go-comicinfo/v2 v2.0.2 github.com/kr/pretty v0.1.0 github.com/mattn/go-sqlite3 v1.14.24 github.com/mholt/archiver/v4 v4.0.0-alpha.8 - github.com/ncruces/go-sqlite3 v0.22.0 - golang.org/x/image v0.23.0 - golang.org/x/text v0.21.0 + github.com/ncruces/go-sqlite3 v0.23.1 + github.com/vmihailenco/msgpack v4.0.4+incompatible + go.etcd.io/bbolt v1.4.0 + golang.org/x/image v0.24.0 + golang.org/x/text v0.22.0 gonum.org/v1/gonum v0.15.1 - modernc.org/sqlite v1.34.5 -) - -require ( - github.com/vmihailenco/msgpack/v5 v5.4.1 - github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect + modernc.org/sqlite v1.35.0 ) require ( @@ -28,10 +26,10 @@ require ( github.com/bodgit/sevenzip v1.3.0 // indirect github.com/bodgit/windows v1.0.0 // indirect github.com/connesc/cipherio v0.2.1 // indirect - github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09 // indirect github.com/dsnet/compress v0.0.1 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/golang/mock v1.6.0 // indirect + github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/errwrap v1.0.0 // indirect @@ -45,15 +43,17 @@ require ( github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect github.com/pierrec/lz4/v4 v4.1.15 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect - github.com/tetratelabs/wazero v1.8.2 // indirect + github.com/tetratelabs/wazero v1.9.0 // indirect github.com/therootcompany/xz v1.0.1 // indirect github.com/ulikunitz/xz v0.5.10 // indirect go4.org v0.0.0-20200411211856-f5505b9728dd // indirect - golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect - golang.org/x/sys v0.29.0 // indirect - modernc.org/libc v1.55.3 // indirect - modernc.org/mathutil v1.6.0 // indirect - modernc.org/memory v1.8.0 // indirect + golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect + golang.org/x/sys v0.30.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/protobuf v1.36.5 // indirect + modernc.org/libc v1.61.13 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.8.2 // indirect ) replace golang.org/x/text v0.17.0 => github.com/lordwelch/text v0.0.0-20240505231825-4893f344170f diff --git a/go.sum b/go.sum index 2e886e7..8bf1ce7 100644 --- a/go.sum +++ b/go.sum @@ -15,8 +15,8 @@ cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+ cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -gitea.narnian.us/lordwelch/goimagehash v0.0.0-20250113012632-72c18ebad3c6 h1:DqwlGXgaLjXVEio1+podh25e7q/phY02aTMsYkfryqQ= -gitea.narnian.us/lordwelch/goimagehash v0.0.0-20250113012632-72c18ebad3c6/go.mod h1:q+HjeXYjflX3nk3qt74Gho8z+6MGe5lZO/Po+kiUK7E= +gitea.narnian.us/lordwelch/goimagehash v0.0.0-20250130004139-e91c39c79e0d h1:mFnVC/tEHk6woq6FBulwzGcuNdYn+zNhXNBILuetQJs= +gitea.narnian.us/lordwelch/goimagehash v0.0.0-20250130004139-e91c39c79e0d/go.mod h1:UDwa7njhbB5nzxIjHbT9Mjlve9GYn3wzxAcQax1XRvE= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY= @@ -65,6 +65,10 @@ github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= @@ -73,6 +77,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= @@ -111,8 +118,8 @@ github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBW github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mholt/archiver/v4 v4.0.0-alpha.8 h1:tRGQuDVPh66WCOelqe6LIGh0gwmfwxUrSSDunscGsRM= github.com/mholt/archiver/v4 v4.0.0-alpha.8/go.mod h1:5f7FUYGXdJWUjESffJaYR4R60VhnHxb2X3T1teMyv5A= -github.com/ncruces/go-sqlite3 v0.22.0 h1:FkGSBhd0TY6e66k1LVhyEpA+RnG/8QkQNed5pjIk4cs= -github.com/ncruces/go-sqlite3 v0.22.0/go.mod h1:ueXOZXYZS2OFQirCU3mHneDwJm5fGKHrtccYBeGEV7M= +github.com/ncruces/go-sqlite3 v0.23.1 h1:zGAd76q+Tr18z/xKGatUlzBQdjR3J+rexfANUcjAgkY= +github.com/ncruces/go-sqlite3 v0.23.1/go.mod h1:Xg3FyAZl25HcBSFmcbymdfoTqD7jRnBUmv1jSrbIjdE= github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M= @@ -131,20 +138,21 @@ github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/tetratelabs/wazero v1.8.2 h1:yIgLR/b2bN31bjxwXHD8a3d+BogigR952csSDdLYEv4= -github.com/tetratelabs/wazero v1.8.2/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tetratelabs/wazero v1.9.0 h1:IcZ56OuxrtaEz8UYNRHBrUa9bYeX9oVY93KspZZBf/I= +github.com/tetratelabs/wazero v1.9.0/go.mod h1:TSbcXCfFP0L2FGkRPxHphadXPjo1T6W+CseNNY7EkjM= github.com/therootcompany/xz v1.0.1 h1:CmOtsn1CbtmyYiusbfmhmkpAAETj0wBIH6kCYaX+xzw= github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0BWbMn8qNMY= github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8= github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8= -github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok= -github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= -github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= +github.com/vmihailenco/msgpack v4.0.4+incompatible h1:dSLoQfGFAo3F6OoNhwUmLwVgaUXK79GlxNBwueZn0xI= +github.com/vmihailenco/msgpack v4.0.4+incompatible/go.mod h1:fy3FlTQTDXWkZ7Bh6AcGMlsjHatGryHQYUTf1ShIgkk= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= +go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -155,6 +163,7 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -163,13 +172,13 @@ golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= -golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa h1:ELnwvuAXPNtPk1TJRuGkI9fDTwym6AYBu0qzT8AcHdI= -golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ= +golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4= +golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/image v0.23.0 h1:HseQ7c2OpPKTPVzNjG5fwJsOTCiiwS4QdsYi5XU6H68= -golang.org/x/image v0.23.0/go.mod h1:wJJBTdLfCCf3tiHa1fNxpZmUI4mmoZvwMCPP0ddoNKY= +golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ= +golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -186,8 +195,9 @@ golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= -golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= +golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -202,7 +212,9 @@ golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -215,8 +227,9 @@ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= +golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -233,17 +246,23 @@ golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -271,8 +290,9 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= -golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= -golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= +golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -292,6 +312,8 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= @@ -312,7 +334,12 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= +google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -324,28 +351,28 @@ honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= -modernc.org/cc/v4 v4.21.4 h1:3Be/Rdo1fpr8GrQ7IVw9OHtplU4gWbb+wNgeoBMmGLQ= -modernc.org/cc/v4 v4.21.4/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ= -modernc.org/ccgo/v4 v4.19.2 h1:lwQZgvboKD0jBwdaeVCTouxhxAyN6iawF3STraAal8Y= -modernc.org/ccgo/v4 v4.19.2/go.mod h1:ysS3mxiMV38XGRTTcgo0DQTeTmAO4oCmJl1nX9VFI3s= +modernc.org/cc/v4 v4.24.4 h1:TFkx1s6dCkQpd6dKurBNmpo+G8Zl4Sq/ztJ+2+DEsh0= +modernc.org/cc/v4 v4.24.4/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.23.16 h1:Z2N+kk38b7SfySC1ZkpGLN2vthNJP1+ZzGZIlH7uBxo= +modernc.org/ccgo/v4 v4.23.16/go.mod h1:nNma8goMTY7aQZQNTyN9AIoJfxav4nvTnvKThAeMDdo= modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE= modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ= -modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw= -modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU= -modernc.org/libc v1.55.3 h1:AzcW1mhlPNrRtjS5sS+eW2ISCgSOLLNyFzRh/V3Qj/U= -modernc.org/libc v1.55.3/go.mod h1:qFXepLhz+JjFThQ4kzwzOjA/y/artDeg+pcYnY+Q83w= -modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4= -modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo= -modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E= -modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU= -modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= -modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0= -modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc= -modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss= -modernc.org/sqlite v1.34.5 h1:Bb6SR13/fjp15jt70CL4f18JIN7p7dnMExd+UFnF15g= -modernc.org/sqlite v1.34.5/go.mod h1:YLuNmX9NKs8wRNK2ko1LW1NGYcc9FkBO69JOt1AR9JE= -modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA= -modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0= +modernc.org/gc/v2 v2.6.3 h1:aJVhcqAte49LF+mGveZ5KPlsp4tdGdAOT4sipJXADjw= +modernc.org/gc/v2 v2.6.3/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/libc v1.61.13 h1:3LRd6ZO1ezsFiX1y+bHd1ipyEHIJKvuprv0sLTBwLW8= +modernc.org/libc v1.61.13/go.mod h1:8F/uJWL/3nNil0Lgt1Dpz+GgkApWh04N3el3hxJcA6E= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.8.2 h1:cL9L4bcoAObu4NkxOlKWBWtNHIsnnACGF/TbqQ6sbcI= +modernc.org/memory v1.8.2/go.mod h1:ZbjSvMO5NQ1A2i3bWeDiVMxIorXwdClKE/0SZ+BMotU= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.35.0 h1:yQps4fegMnZFdphtzlfQTCNBWtS0CZv48pRpW3RFHRw= +modernc.org/sqlite v1.35.0/go.mod h1:9cr2sicr7jIaWTBKQmAxQLfBv9LL0su4ZTEV+utt3ic= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= diff --git a/hashing.go b/hashing.go index 588c309..b2caa23 100644 --- a/hashing.go +++ b/hashing.go @@ -39,7 +39,8 @@ const ( ) const ( - ComicVine Source = "comicvine.gamespot.com" + ComicVine Source = "comicvine.gamespot.com" + SavedHashVersion int = 2 ) type Source string @@ -78,16 +79,9 @@ type Hash struct { } // IDList is a map of domain to ID eg IDs["comicvine.gamespot.com"] = []string{"1235"} -// Maps are extremely expensive in go for small maps this should only be used to return info to a user no internal code should use this +// Maps are extremely expensive in go for small maps this should only be used to return info to a user or as a map containing all IDs for a source type IDList map[Source][]string -type OldSavedHashes map[Source]map[string][3]uint64 - -type SavedHashes struct { - IDs [][]ID - Hashes [3]map[uint64]int -} - func ToIDList(ids []ID) IDList { idlist := IDList{} for _, id := range ids { @@ -96,10 +90,10 @@ func ToIDList(ids []ID) IDList { return idlist } func InsertID(ids []ID, id ID) []ID { - index, itemFound := slices.BinarySearchFunc(ids, id, func(e ID, t ID) int { + index, itemFound := slices.BinarySearchFunc(ids, id, func(existing ID, target ID) int { return cmp.Or( - cmp.Compare(e.Domain, t.Domain), - cmp.Compare(e.ID, t.ID), + cmp.Compare(existing.Domain, target.Domain), + cmp.Compare(existing.ID, target.ID), ) }) if itemFound { @@ -107,52 +101,6 @@ func InsertID(ids []ID, id ID) []ID { } return slices.Insert(ids, index, id) } -func (s *SavedHashes) InsertHash(hash Hash, id ID) { - for i, h := range s.Hashes { - if h == nil { - s.Hashes[i] = make(map[uint64]int) - } - } - - hashType := int(hash.Kind) - 1 - idx, hashFound := s.Hashes[hashType][hash.Hash] - if !hashFound { - idx = len(s.IDs) - s.IDs = append(s.IDs, make([]ID, 0, 3)) - } - s.IDs[idx] = InsertID(s.IDs[idx], id) - s.Hashes[hashType][hash.Hash] = idx -} - -func ConvertSavedHashes(oldHashes OldSavedHashes) SavedHashes { - t := SavedHashes{} - idcount := 0 - for _, ids := range oldHashes { - idcount += len(ids) - } - t.IDs = make([][]ID, 0, idcount) - t.Hashes[0] = make(map[uint64]int, idcount) - t.Hashes[1] = make(map[uint64]int, idcount) - t.Hashes[2] = make(map[uint64]int, idcount) - for domain, sourceHashes := range oldHashes { - for id, hashes := range sourceHashes { - idx := len(t.IDs) - t.IDs = append(t.IDs, []ID{{domain, id}}) - for hashType, hash := range hashes { - t.Hashes[hashType][hash] = idx - } - } - } - fmt.Println("Expected number of IDs", idcount) - idcount = 0 - for _, ids := range t.IDs { - idcount += len(ids) - } - fmt.Println("length of hashes", len(t.Hashes[0])+len(t.Hashes[1])+len(t.Hashes[2])) - fmt.Println("Length of ID lists", len(t.IDs)) - fmt.Println("Total number of IDs", idcount) - return t -} type NewIDs struct { OldID ID @@ -169,7 +117,7 @@ type HashStorage interface { } func Atleast(maxDistance int, searchHash uint64, hashes []uint64) []Match { - matchingHashes := make([]Match, 0, len(hashes)/2) // hope that we don't need all of them + matchingHashes := make([]Match, 0, 20) // hope that we don't need all of them for _, storedHash := range hashes { distance := bits.OnesCount64(searchHash ^ storedHash) if distance <= maxDistance { diff --git a/map.go b/map.go index 031273d..2969c4c 100644 --- a/map.go +++ b/map.go @@ -1,150 +1,156 @@ package ch import ( - "cmp" "fmt" "slices" "sync" + + "gitea.narnian.us/lordwelch/goimagehash" ) type MapStorage struct { basicMapStorage - partialHash [3][8]map[uint8][]uint64 + partialAHash [8]map[uint8][]uint64 + partialDHash [8]map[uint8][]uint64 + partialPHash [8]map[uint8][]uint64 } func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) { - var foundMatches []Result + var ( + foundMatches []Result + tl timeLog + ) m.hashMutex.RLock() defer m.hashMutex.RUnlock() - resetTime() - defer logTime("Search Complete") - if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate - for _, hash := range hashes { - hashType := int(hash.Kind) - 1 - index, hashFound := m.findHash(hashType, hash.Hash) - if hashFound { - foundMatches = append(foundMatches, Result{ - Distance: 0, - Hash: hash, - IDs: ToIDList(*m.hashes[hashType][index].ids), - }) - } - } - - // If we have exact matches don't bother with other matches - logTime("Search Exact") - if len(foundMatches) > 0 && exactOnly { - return foundMatches, nil - } + if exactOnly { + return m.basicMapStorage.GetMatches(hashes, max, exactOnly) } + tl.resetTime() + defer tl.logTime("Search Complete") totalPartialHashes := 0 + for _, searchHash := range hashes { - foundHashes := make(map[uint64]struct{}) - hashType := int(searchHash.Kind) - 1 + currentHashes, currentPartialHashes := m.getCurrentHashes(searchHash.Kind) + potentialMatches := []uint64{} + for i, partialHash := range SplitHash(searchHash.Hash) { - partialHashes := m.partialHash[hashType][i][partialHash] - totalPartialHashes += len(partialHashes) - for _, match := range Atleast(max, searchHash.Hash, partialHashes) { - _, alreadyMatched := foundHashes[match.Hash] - if index, hashFound := m.findHash(hashType, match.Hash); hashFound && !alreadyMatched { - foundHashes[match.Hash] = struct{}{} - foundMatches = append(foundMatches, Result{IDs: ToIDList(*m.hashes[hashType][index].ids), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}}) + potentialMatches = append(potentialMatches, currentPartialHashes[i][partialHash]...) + } + + totalPartialHashes += len(potentialMatches) + mappedIds := map[*[]ID]bool{} + + for _, match := range Atleast(max, searchHash.Hash, potentialMatches) { + matchedHash := Hash{match.Hash, searchHash.Kind} + index, count := m.findHash(matchedHash) + if count < 1 { + continue + } + for _, storedHash := range currentHashes[index : index+count] { + ids := m.ids[storedHash.ID] + if mappedIds[ids] { + continue } + mappedIds[ids] = true + + foundMatches = append(foundMatches, Result{ + Distance: 0, + Hash: storedHash.Hash, + IDs: ToIDList(*m.ids[storedHash.ID]), + }) + } } } fmt.Println("Total partial hashes tested:", totalPartialHashes) - go m.printSizes() return foundMatches, nil } +// getCurrentHashes must have a read lock before using +func (m *MapStorage) getCurrentHashes(kind goimagehash.Kind) ([]SavedHash, [8]map[uint8][]uint64) { + if kind == goimagehash.AHash { + return m.aHashes, m.partialAHash + } + if kind == goimagehash.DHash { + return m.dHashes, m.partialDHash + } + if kind == goimagehash.PHash { + return m.pHashes, m.partialPHash + } + panic("Unknown hash type: " + kind.String()) +} + func (m *MapStorage) MapHashes(hash ImageHash) { m.basicMapStorage.MapHashes(hash) for _, hash := range hash.Hashes { - hashType := int(hash.Kind) - 1 + _, partialHashes := m.getCurrentHashes(hash.Kind) for i, partialHash := range SplitHash(hash.Hash) { - m.partialHash[hashType][i][partialHash] = Insert(m.partialHash[hashType][i][partialHash], hash.Hash) + partialHashes[i][partialHash] = Insert(partialHashes[i][partialHash], hash.Hash) } } } func (m *MapStorage) DecodeHashes(hashes SavedHashes) error { - for hashType, sourceHashes := range hashes.Hashes { - m.hashes[hashType] = make([]structHash, len(sourceHashes)) - for savedHash, idlistLocation := range sourceHashes { - m.hashes[hashType] = append(m.hashes[hashType], structHash{savedHash, &hashes.IDs[idlistLocation]}) - } + if err := m.basicMapStorage.DecodeHashes(hashes); err != nil { + return err } - for hashType := range m.hashes { - slices.SortFunc(m.hashes[hashType], func(a, b structHash) int { - return cmp.Compare(a.hash, b.hash) - }) - } - m.printSizes() - for _, partialHashes := range m.partialHash { - for _, partMap := range partialHashes { - for part, hashes := range partMap { - slices.Sort(hashes) - partMap[part] = slices.Compact(hashes) - } - } - } - m.printSizes() + + mapPartialHashes(m.aHashes, m.partialAHash) + mapPartialHashes(m.dHashes, m.partialDHash) + mapPartialHashes(m.pHashes, m.partialPHash) + + compactPartialHashes(m.partialAHash) + compactPartialHashes(m.partialDHash) + compactPartialHashes(m.partialPHash) + return nil } -func (m *MapStorage) printSizes() { - fmt.Println("Length of hashes:", len(m.hashes[0])+len(m.hashes[1])+len(m.hashes[2])) - // fmt.Println("Size of", "hashes:", size.Of(m.hashes)/1024/1024, "MB") - // fmt.Println("Size of", "ids:", size.Of(m.ids)/1024/1024, "MB") - // fmt.Println("Size of", "MapStorage:", size.Of(m)/1024/1024, "MB") - -} - func NewMapStorage() (HashStorage, error) { + storage := &MapStorage{ basicMapStorage: basicMapStorage{ hashMutex: &sync.RWMutex{}, - hashes: [3][]structHash{ - []structHash{}, - []structHash{}, - []structHash{}, - }, - }, - partialHash: [3][8]map[uint8][]uint64{ - { - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - }, - { - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - }, - { - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - make(map[uint8][]uint64), - }, + ids: make(map[ID]*[]ID), + aHashes: []SavedHash{}, + dHashes: []SavedHash{}, + pHashes: []SavedHash{}, }, + partialAHash: newPartialHash(), + partialDHash: newPartialHash(), + partialPHash: newPartialHash(), } return storage, nil } + +func newPartialHash() [8]map[uint8][]uint64 { + return [8]map[uint8][]uint64{ + map[uint8][]uint64{}, + map[uint8][]uint64{}, + map[uint8][]uint64{}, + map[uint8][]uint64{}, + map[uint8][]uint64{}, + map[uint8][]uint64{}, + map[uint8][]uint64{}, + map[uint8][]uint64{}, + } +} + +func mapPartialHashes(hashes []SavedHash, partialHashMap [8]map[uint8][]uint64) { + for _, savedHash := range hashes { + for i, partialHash := range SplitHash(savedHash.Hash.Hash) { + partialHashMap[i][partialHash] = append(partialHashMap[i][partialHash], savedHash.Hash.Hash) + } + } +} + +func compactPartialHashes(partialHashMap [8]map[uint8][]uint64) { + for _, partMap := range partialHashMap { + for part, hashes := range partMap { + slices.Sort(hashes) + partMap[part] = slices.Compact(hashes) + } + } +} diff --git a/savedHashes.go b/savedHashes.go new file mode 100644 index 0000000..386c04f --- /dev/null +++ b/savedHashes.go @@ -0,0 +1,254 @@ +package ch + +import ( + "cmp" + "encoding/json" + "errors" + "fmt" + "slices" + "strings" + + "gitea.narnian.us/lordwelch/goimagehash" + "github.com/vmihailenco/msgpack" +) + +type Format int + +const ( + Msgpack Format = iota + 1 + JSON + + CurrentSavedHashesVersion int = 2 +) + +var versionMap map[int]versionDecoder + +var formatNames = map[Format]string{ + JSON: "json", + Msgpack: "msgpack", +} + +var formatValues = map[string]Format{ + "json": JSON, + "msgpack": Msgpack, +} + +type OldSavedHashes map[Source]map[string][3]uint64 +type SavedHashesv1 struct { + IDs [][]ID + Hashes [3]map[uint64]int +} + +// SavedHashes The IDs and Hashes fields have no direct correlation +// It is perfectly valid to have an empty IDs or an empty Hashes field +// If two covers have identical hashes then they should be two entries in Hashes not a set in IDs with two IDs from the same source +type SavedHashes struct { + Version int + IDs [][]ID // List of sets of IDs that are the same across Sources, should generally only have one Source per set + Hashes []SavedHash // List of all known hashes, hashes will be duplicated for each source +} + +type SavedHash struct { + Hash Hash + ID ID +} +type Encoder func(any) ([]byte, error) +type Decoder func([]byte, interface{}) error +type versionDecoder func(Decoder, []byte) (*SavedHashes, error) + +var NoHashes = errors.New("no hashes") +var DecodeError = errors.New("decoder failure") + +func (f Format) String() string { + if name, known := formatNames[f]; known { + return name + } + return "Unknown" +} + +func (f *Format) Set(s string) error { + if format, known := formatValues[strings.ToLower(s)]; known { + *f = format + } else { + return fmt.Errorf("Unknown format: %d", f) + } + return nil +} + +func (s *SavedHashes) InsertHash(hash Hash, id ID) { + h := SavedHash{ + hash, + id, + } + index, itemFound := slices.BinarySearchFunc(s.Hashes, h, func(existing SavedHash, target SavedHash) int { + return cmp.Or( + cmp.Compare(existing.Hash.Hash, target.Hash.Hash), + cmp.Compare(existing.Hash.Kind, target.Hash.Kind), + cmp.Compare(existing.ID.Domain, target.ID.Domain), + cmp.Compare(existing.ID.ID, target.ID.ID), + ) + }) + if !itemFound { + s.Hashes = slices.Insert(s.Hashes, index, h) + } +} + +func ConvertHashesV0(oldHashes OldSavedHashes) *SavedHashes { + t := SavedHashes{} + idcount := 0 + for _, ids := range oldHashes { + idcount += len(ids) + } + t.IDs = make([][]ID, 0, idcount) + t.Hashes = make([]SavedHash, 0, idcount) + for domain, sourceHashes := range oldHashes { + for id, hashes := range sourceHashes { + t.IDs = append(t.IDs, []ID{{domain, id}}) + for hashType, hash := range hashes { + t.Hashes = append(t.Hashes, SavedHash{ + Hash: Hash{ + Kind: goimagehash.Kind(hashType + 1), + Hash: hash, + }, + ID: ID{domain, id}, + }) + } + } + } + fmt.Println("length of hashes", len(t.Hashes)) + fmt.Println("Length of ID lists", len(t.IDs)) + return &t +} + +func ConvertHashesV1(oldHashes SavedHashesv1) *SavedHashes { + t := SavedHashes{} + hashCount := 0 + for _, hashes := range oldHashes.Hashes { + hashCount += len(hashes) + } + t.IDs = oldHashes.IDs + t.Hashes = make([]SavedHash, 0, hashCount) + for hashType, sourceHashes := range oldHashes.Hashes { + for hash, index := range sourceHashes { + for _, id := range oldHashes.IDs[index] { + t.Hashes = append(t.Hashes, SavedHash{ + ID: id, + Hash: Hash{ + Kind: goimagehash.Kind(hashType + 1), + Hash: hash, + }, + }) + } + } + } + fmt.Println("length of hashes", len(t.Hashes)) + fmt.Println("Length of ID lists", len(t.IDs)) + return &t +} + +func DecodeHashesV0(decode Decoder, hashes []byte) (*SavedHashes, error) { + loadedHashes := OldSavedHashes{} + err := decode(hashes, &loadedHashes) + if err != nil { + return nil, fmt.Errorf("%w: %w", DecodeError, err) + } + if len(loadedHashes) == 0 { + return nil, NoHashes + } + return ConvertHashesV0(loadedHashes), nil +} + +func DecodeHashesV1(decode Decoder, hashes []byte) (*SavedHashes, error) { + loadedHashes := SavedHashesv1{} + err := decode(hashes, &loadedHashes) + if err != nil { + return nil, fmt.Errorf("%w: %w", DecodeError, err) + } + hashesCount := 0 + for _, hashes := range loadedHashes.Hashes { + hashesCount += len(hashes) + } + if hashesCount < 1 { + return nil, NoHashes + } + return ConvertHashesV1(loadedHashes), nil +} + +func DecodeHashesV2(decode Decoder, hashes []byte) (*SavedHashes, error) { + loadedHashes := SavedHashes{} + err := decode(hashes, &loadedHashes) + if err != nil { + return nil, fmt.Errorf("%w: %w", DecodeError, err) + } + if len(loadedHashes.Hashes) < 1 && len(loadedHashes.IDs) < 1 { + return nil, NoHashes + } + + return &loadedHashes, nil +} + +func getSavedHashesVersion(decode Decoder, hashes []byte) (int, error) { + type version struct { + Version int + } + var savedVersion version + err := decode(hashes, &savedVersion) + if err != nil { + return -1, fmt.Errorf("%w: %w", DecodeError, err) + } + if savedVersion.Version > 1 { + return savedVersion.Version, nil + } + return -1, nil +} +func DecodeHashes(format Format, hashes []byte) (*SavedHashes, error) { + var decode Decoder + switch format { + case Msgpack: + decode = msgpack.Unmarshal + case JSON: + decode = json.Unmarshal + + default: + return nil, fmt.Errorf("Unknown format: %v", format) + } + version, err := getSavedHashesVersion(decode, hashes) + if err != nil { + return nil, err + } + + if decodeVersion, knownVersion := versionMap[version]; knownVersion { + return decodeVersion(decode, hashes) + } + + for _, decodeVersion := range []versionDecoder{ + DecodeHashesV0, + DecodeHashesV1, + DecodeHashesV2, + } { + loadedHashes, err := decodeVersion(decode, hashes) + if err == nil { + return loadedHashes, nil + } + if !errors.Is(err, NoHashes) { + return nil, err + } + } + + return nil, NoHashes +} + +func EncodeHashes(hashes SavedHashes, format Format) ([]byte, error) { + var encoder Encoder + switch format { + case Msgpack: + encoder = msgpack.Marshal + case JSON: + encoder = json.Marshal + default: + return nil, fmt.Errorf("Unknown format: %v", format) + } + + hashes.Version = CurrentSavedHashesVersion + return encoder(hashes) +} diff --git a/sqlite.go b/sqlite.go index 52be9b6..b73b584 100644 --- a/sqlite.go +++ b/sqlite.go @@ -8,7 +8,6 @@ import ( "log" "math/bits" "strings" - "time" "gitea.narnian.us/lordwelch/goimagehash" _ "modernc.org/sqlite" @@ -66,7 +65,7 @@ func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, items ...interface{ return hashes, nil } -func (s *sqliteStorage) findPartialHashes(max int, search_hash int64, kind goimagehash.Kind) ([]sqliteHash, error) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate +func (s *sqliteStorage) findPartialHashes(tl timeLog, max int, search_hash int64, kind goimagehash.Kind) ([]sqliteHash, error) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate hashes := []sqliteHash{} statement, err := s.db.PrepareContext(context.Background(), `SELECT rowid,hash,kind FROM Hashes WHERE (kind=?) AND (((hash >> (0 * 8) & 0xFF)=(?2 >> (0 * 8) & 0xFF)) OR ((hash >> (1 * 8) & 0xFF)=(?2 >> (1 * 8) & 0xFF)) OR ((hash >> (2 * 8) & 0xFF)=(?2 >> (2 * 8) & 0xFF)) OR ((hash >> (3 * 8) & 0xFF)=(?2 >> (3 * 8) & 0xFF)) OR ((hash >> (4 * 8) & 0xFF)=(?2 >> (4 * 8) & 0xFF)) OR ((hash >> (5 * 8) & 0xFF)=(?2 >> (5 * 8) & 0xFF)) OR ((hash >> (6 * 8) & 0xFF)=(?2 >> (6 * 8) & 0xFF)) OR ((hash >> (7 * 8) & 0xFF)=(?2 >> (7 * 8) & 0xFF)));`) if err != nil { @@ -94,7 +93,7 @@ func (s *sqliteStorage) findPartialHashes(max int, search_hash int64, kind goima } } rows.Close() - logTime("Filter partial " + kind.String()) + tl.logTime("Filter partial " + kind.String()) statement, err = s.db.PrepareContext(context.Background(), `SELECT DISTINCT IDS.domain, IDs.id, id_hash.hashid FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid in (`+strings.TrimRight(strings.Repeat("?,", len(hashes)), ",")+`)) ORDER BY IDs.domain, IDs.ID;`) if err != nil { @@ -171,35 +170,18 @@ ANALYZE; return nil } -var ( - total time.Duration - t = time.Now() -) - -func resetTime() { - total = 0 - t = time.Now() -} - -func logTime(log string) { - n := time.Now() - s := n.Sub(t) - t = n - total += s - fmt.Printf("total: %v, %s: %v\n", total, log, s) -} - func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) { var ( foundMatches []Result + tl timeLog ) - resetTime() + tl.resetTime() if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate statement, err := s.db.Prepare(`SELECT rowid,hash,kind FROM Hashes WHERE ` + strings.TrimSuffix(strings.Repeat("(hash=? AND kind=?) OR", len(hashes)), "OR") + `ORDER BY kind,hash;`) if err != nil { - logTime("Fail exact") + tl.logTime("Fail exact") return foundMatches, err } @@ -221,17 +203,17 @@ func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Re if len(foundMatches) > 0 && exactOnly { return foundMatches, nil } - logTime("Search Exact") + tl.logTime("Search Exact") } foundHashes := make(map[uint64]struct{}) for _, hash := range hashes { - hashes, err := s.findPartialHashes(max, int64(hash.Hash), hash.Kind) + hashes, err := s.findPartialHashes(tl, max, int64(hash.Hash), hash.Kind) if err != nil { return foundMatches, err } - logTime("Search partial " + hash.Kind.String()) + tl.logTime("Search partial " + hash.Kind.String()) for _, hash := range hashes { if _, alreadyMatched := foundHashes[hash.Hash.Hash]; !alreadyMatched { @@ -251,27 +233,26 @@ func (s *sqliteStorage) MapHashes(hash ImageHash) { if err != nil { panic(err) } - insertHashes, err := tx.Prepare(` -INSERT INTO Hashes (hash,kind) VALUES (?,?) ON CONFLICT DO UPDATE SET hash=?1 RETURNING hashid -`) + insertHashes, err := tx.Prepare(`INSERT INTO Hashes (hash,kind) VALUES (?,?) ON CONFLICT DO UPDATE SET hash=?1 RETURNING hashid`) if err != nil { panic(err) } - rows, err := tx.Query(` -INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RETURNING idid -`, hash.ID.Domain, hash.ID.ID) + + rows, err := tx.Query(`INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RETURNING idid`, hash.ID.Domain, hash.ID.ID) if err != nil { panic(err) } if !rows.Next() { - panic("Unable to insert IDs") + panic("Unable to insert ID") } + var id_id int64 err = rows.Scan(&id_id) if err != nil { panic(err) } rows.Close() + hash_ids := []int64{} for _, hash := range hash.Hashes { rows, err := insertHashes.Query(int64(hash.Hash), hash.Kind) @@ -280,21 +261,24 @@ INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RET } if !rows.Next() { - panic("Unable to insert IDs") + panic("Unable to insert Hash") } + var id int64 err = rows.Scan(&id) rows.Close() if err != nil { panic(err) } + hash_ids = append(hash_ids, id) } - var ids []any + var ids []any = make([]any, 0, len(hash_ids)+1) + ids = append(ids, id_id) for _, hash_id := range hash_ids { - ids = append(ids, hash_id, id_id) + ids = append(ids, hash_id) } - _, err = tx.Exec(`INSERT INTO id_hash (hashid,idid) VALUES `+strings.TrimSuffix(strings.Repeat("(?, ?),", len(hash_ids)), ",")+` ON CONFLICT DO NOTHING;`, ids...) + _, err = tx.Exec(`INSERT INTO id_hash (idid, hashid) VALUES `+strings.TrimSuffix(strings.Repeat("(?1, ?),", len(hash_ids)), ",")+` ON CONFLICT DO NOTHING;`, ids...) if err != nil { panic(fmt.Errorf("Failed inserting: %v,%v: %w", hash.ID.Domain, hash.ID.ID, err)) } @@ -311,16 +295,11 @@ func (s *sqliteStorage) DecodeHashes(hashes SavedHashes) error { return err } - for hashType, sourceHashes := range hashes.Hashes { - hashKind := goimagehash.Kind(hashType + 1) - for hash, idsLocations := range sourceHashes { - for _, id := range hashes.IDs[idsLocations] { - s.MapHashes(ImageHash{ - Hashes: []Hash{{hash, hashKind}}, - ID: id, - }) - } - } + for _, savedHash := range hashes.Hashes { + s.MapHashes(ImageHash{ + Hashes: []Hash{savedHash.Hash}, + ID: savedHash.ID, + }) } err = s.createIndexes() if err != nil { @@ -434,28 +413,27 @@ func NewSqliteStorage(db, path string) (HashStorage, error) { _, err = sqlite.db.Exec(` PRAGMA foreign_keys=ON; CREATE TABLE IF NOT EXISTS Hashes( - hashid INTEGER PRIMARY KEY, - hash INT NOT NULL, - kind int NOT NULL, + hashid INTEGER PRIMARY KEY, + hash INTEGER NOT NULL, + kind INTEGER NOT NULL, + id INTEGER NOT NULL, + FOREIGN KEY(id) REFERENCES IDs(idid), UNIQUE(kind, hash) ); CREATE TABLE IF NOT EXISTS IDs( id TEXT NOT NULL, domain TEXT NOT NULL, - idid INTEGER PRIMARY KEY, + idid INTEGER PRIMARY KEY, UNIQUE (domain, id) ); -CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, id); -CREATE TABLE IF NOT EXISTS id_hash( - hashid INTEGER, - idid INTEGER, - FOREIGN KEY(hashid) REFERENCES Hashes(hashid), - FOREIGN KEY(idid) REFERENCES IDs(idid) - UNIQUE (hashid, idid) +CREATE TABLE IF NOT EXISTS EquivalentIDs( + id INTEGER + groupid INTEGER, + FOREIGN KEY(idid) REFERENCES IDs(idid) + UNIQUE (groupid, id) ); - `) if err != nil { panic(err) diff --git a/timing.go b/timing.go new file mode 100644 index 0000000..85e06c9 --- /dev/null +++ b/timing.go @@ -0,0 +1,24 @@ +package ch + +import ( + "fmt" + "time" +) + +type timeLog struct { + total time.Duration + last time.Time +} + +func (t *timeLog) resetTime() { + t.total = 0 + t.last = time.Now() +} + +func (t *timeLog) logTime(log string) { + now := time.Now() + diff := now.Sub(t.last) + t.last = now + t.total += diff + fmt.Printf("total: %v, %s: %v\n", t.total, log, diff) +} diff --git a/vp-tree.go b/vp-tree.go index faa0b9d..933483f 100644 --- a/vp-tree.go +++ b/vp-tree.go @@ -10,12 +10,17 @@ import ( ) type VPTree struct { - trees [3]*vptree.Tree - hashes [3][]vptree.Comparable + aTree *vptree.Tree + dTree *vptree.Tree + pTree *vptree.Tree + ids map[ID]*[]ID + + aHashes []vptree.Comparable // temporary, only used for vptree creation + dHashes []vptree.Comparable // temporary, only used for vptree creation + pHashes []vptree.Comparable // temporary, only used for vptree creation } type VPHash struct { - Hash Hash - IDs []ID + SavedHash } func (h *VPHash) Distance(c vptree.Comparable) float64 { @@ -27,57 +32,108 @@ func (h *VPHash) Distance(c vptree.Comparable) float64 { } func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) { - var matches []Result - var exactMatches []Result - fmt.Println(hashes) + var ( + matches []Result + exactMatches []Result + tl timeLog + ) + tl.resetTime() + defer tl.logTime("Search Complete") + for _, hash := range hashes { results := vptree.NewDistKeeper(float64(max)) - hashType := int(hash.Kind) - 1 - v.trees[hashType].NearestSet(results, &VPHash{Hash: hash}) + + currentTree := v.getCurrentTree(hash.Kind) + currentTree.NearestSet(results, &VPHash{SavedHash{Hash: hash}}) + + mappedIds := map[*[]ID]bool{} for _, result := range results.Heap { - vphash := result.Comparable.(*VPHash) + storedHash := result.Comparable.(*VPHash) + ids := v.ids[storedHash.ID] + if mappedIds[ids] { + continue + } + mappedIds[ids] = true if result.Dist == 0 { exactMatches = append(exactMatches, Result{ - IDs: ToIDList(vphash.IDs), + IDs: ToIDList(*v.ids[storedHash.ID]), Distance: int(result.Dist), - Hash: vphash.Hash, + Hash: storedHash.Hash, }) } else { matches = append(matches, Result{ - IDs: ToIDList(vphash.IDs), + IDs: ToIDList(*v.ids[storedHash.ID]), Distance: int(result.Dist), - Hash: vphash.Hash, + Hash: storedHash.Hash, }) } } } - if len(exactMatches) > 0 && exactOnly { + if exactOnly { return exactMatches, nil } - matches = append(exactMatches[:len(exactMatches):len(exactMatches)], matches...) + exactMatches = append(exactMatches, matches...) return matches, nil } +func (v *VPTree) getCurrentTree(kind goimagehash.Kind) *vptree.Tree { + if kind == goimagehash.AHash { + return v.aTree + } + if kind == goimagehash.DHash { + return v.dTree + } + if kind == goimagehash.PHash { + return v.pTree + } + panic("Unknown hash type: " + kind.String()) +} + func (v *VPTree) MapHashes(ImageHash) { panic("Not Implemented") } func (v *VPTree) DecodeHashes(hashes SavedHashes) error { - var err error - for hashType, sourceHashes := range hashes.Hashes { - for hash, idsLocation := range sourceHashes { - var ( - hashKind = goimagehash.Kind(hashType + 1) - ) - hash := &VPHash{Hash{hash, hashKind}, hashes.IDs[idsLocation]} - v.hashes[hashType] = append(v.hashes[hashType], hash) + + // Initialize all the known equal IDs + for _, ids := range hashes.IDs { + for _, id := range ids { + v.ids[id] = &ids } } - for hashType := range 3 { - v.trees[hashType], err = vptree.New(v.hashes[hashType], 3, nil) - if err != nil { - return err + var err error + for _, savedHash := range hashes.Hashes { + if savedHash.Hash.Kind == goimagehash.AHash { + v.aHashes = append(v.aHashes, &VPHash{savedHash}) } + if savedHash.Hash.Kind == goimagehash.DHash { + v.dHashes = append(v.dHashes, &VPHash{savedHash}) + } + if savedHash.Hash.Kind == goimagehash.PHash { + v.pHashes = append(v.pHashes, &VPHash{savedHash}) + } + + if savedHash.ID == (ID{}) { + fmt.Println("Empty ID detected") + panic(savedHash) + } + // All known equal IDs are already mapped we can add any missing ones from hashes + if _, ok := v.ids[savedHash.ID]; !ok { + v.ids[savedHash.ID] = &[]ID{savedHash.ID} + } + } + + v.aTree, err = vptree.New(v.aHashes, 3, nil) + if err != nil { + return err + } + v.dTree, err = vptree.New(v.dHashes, 3, nil) + if err != nil { + return err + } + v.pTree, err = vptree.New(v.pHashes, 3, nil) + if err != nil { + return err } return nil } @@ -90,16 +146,31 @@ func (v *VPTree) AssociateIDs(newIDs []NewIDs) error { } func (v *VPTree) GetIDs(id ID) IDList { - return nil + ids, found := v.ids[id] + if !found { + return nil + } + return ToIDList(*ids) } func NewVPStorage() (HashStorage, error) { - - return &VPTree{ - hashes: [3][]vptree.Comparable{ - make([]vptree.Comparable, 0, 1_000_000), - make([]vptree.Comparable, 0, 1_000_000), - make([]vptree.Comparable, 0, 1_000_000), - }, - }, nil + var err error + v := &VPTree{ + aHashes: []vptree.Comparable{}, + dHashes: []vptree.Comparable{}, + pHashes: []vptree.Comparable{}, + } + v.aTree, err = vptree.New(v.aHashes, 3, nil) + if err != nil { + return v, err + } + v.dTree, err = vptree.New(v.dHashes, 3, nil) + if err != nil { + return v, err + } + v.pTree, err = vptree.New(v.pHashes, 3, nil) + if err != nil { + return v, err + } + return v, nil }