Compare commits

...

4 Commits

Author SHA1 Message Date
Timmy Welch
0928ed6ccf Optimize memory usage
Add a basic map storage that does manual searches to conserve memory
Change saved hashes format to allow multiple hashes for a given ID
Add a vptree storage

Maps in Go take up a huge amount of space; changing IDList to []ID took
  memory from over 1GB down to 200MB (note: this was on aarch64 macOS,
  which for some reason uses less memory than aarch64 Linux).
  Exhaustive searches using slices took about 30 ms; search now takes
  50-60 ms, as it takes longer to iterate a map. Partial hashes will
  speed up searches to 8 ms at the cost of 700MB initial memory usage
  and 400MB idle (again on macOS, so probably more like 900MB initial
  -> 600MB idle on an RPi running Linux). A sketch of the slice-based
  ID list follows the commit list below.
2024-09-07 14:51:18 -07:00
Timmy Welch
b1de95021a Add cli flag 2024-09-02 15:35:36 -07:00
Timmy Welch
1955444dcf Add sqlite implementation 2024-09-01 18:14:19 -07:00
Timmy Welch
0069ffd5cb Make runtime hash storage modular 2024-09-01 18:13:47 -07:00
13 changed files with 1322 additions and 374 deletions
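
A minimal sketch of the slice-based ID list described in the first commit message: IDs live in a plain sorted []ID and are inserted with a binary search, mirroring the InsertID helper added later in this diff. The package, function name, and sample ID below are illustrative only, not part of the change.

package main

import (
	"cmp"
	"fmt"
	"slices"
)

// ID mirrors the ID type introduced in this change: a source domain plus an ID string.
type ID struct{ Domain, ID string }

// insertID keeps ids sorted by (Domain, ID) and skips duplicates,
// so membership checks stay a binary search instead of a map lookup.
func insertID(ids []ID, id ID) []ID {
	i, found := slices.BinarySearchFunc(ids, id, func(a, b ID) int {
		return cmp.Or(cmp.Compare(a.Domain, b.Domain), cmp.Compare(a.ID, b.ID))
	})
	if found {
		return ids
	}
	return slices.Insert(ids, i, id)
}

func main() {
	var ids []ID
	ids = insertID(ids, ID{Domain: "comicvine.gamespot.com", ID: "123456"})
	ids = insertID(ids, ID{Domain: "comicvine.gamespot.com", ID: "123456"}) // duplicate, dropped
	fmt.Println(len(ids)) // prints 1
}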

View File

@ -13,7 +13,7 @@ repos:
- id: go-imports
args: [-w]
- repo: https://github.com/golangci/golangci-lint
rev: v1.59.1
rev: v1.60.3
hooks:
- id: golangci-lint
- repo: https://github.com/asottile/setup-cfg-fmt

151
BasicMap.go Normal file
View File

@ -0,0 +1,151 @@
package ch
import (
"fmt"
"math/bits"
"sync"
"gitea.narnian.us/lordwelch/goimagehash"
)
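// basicMapStorage keeps one map per hash kind (ahash, dhash, phash); every value is
// a pointer to a sorted []ID so several hashes can share one ID list. Searches scan
// the whole map for a kind and filter by Hamming distance.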
type basicMapStorage struct {
hashMutex sync.RWMutex
ids map[ID]*[]ID
hashes [3]map[uint64]*[]ID
}
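// Atleast returns every stored hash of the given kind within maxDistance bits
// (Hamming distance) of searchHash, scanning the full map for that kind.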
func (b *basicMapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64) []Result {
hashType := int(hashKind) - 1
matchingHashes := make([]Result, 0, 100) // hope that we don't need all of them
for storedHash, ids := range b.hashes[hashType] {
distance := bits.OnesCount64(searchHash ^ storedHash)
if distance <= maxDistance {
matchingHashes = append(matchingHashes, Result{ToIDList(*ids), distance, Hash{storedHash, hashKind}})
}
}
return matchingHashes
}
func (b *basicMapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var foundMatches []Result
b.hashMutex.RLock()
defer b.hashMutex.RUnlock()
resetTime()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes {
hashType := int(hash.Kind) - 1
ids := b.hashes[hashType][hash.Hash]
if ids != nil && len(*ids) > 0 {
foundMatches = append(foundMatches, Result{
Distance: 0,
Hash: hash,
IDs: ToIDList(*ids),
})
}
}
// If we have exact matches don't bother with other matches
if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil
}
logTime("Search Exact")
}
foundHashes := make(map[uint64]struct{})
totalPartialHashes := 0
for _, hash := range hashes {
for _, match := range b.Atleast(hash.Kind, max, hash.Hash) {
_, alreadyMatched := foundHashes[match.Hash.Hash]
if alreadyMatched {
continue
}
foundHashes[match.Hash.Hash] = struct{}{}
foundMatches = append(foundMatches, match)
}
}
fmt.Println("Total partial hashes tested:", totalPartialHashes, len(foundHashes))
logTime("Search Complete")
go b.printSizes()
return foundMatches, nil
}
func (b *basicMapStorage) MapHashes(hash ImageHash) {
for _, ih := range hash.Hashes {
var (
hashType = int(ih.Kind) - 1
)
*b.hashes[hashType][ih.Hash] = InsertID((*b.hashes[hashType][ih.Hash]), hash.ID)
}
}
func (b *basicMapStorage) DecodeHashes(hashes SavedHashes) error {
for hashType, sourceHashes := range hashes.Hashes {
b.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes))
for savedHash, idlistLocation := range sourceHashes {
b.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation]
}
}
b.printSizes()
return nil
}
func (b *basicMapStorage) printSizes() {
// fmt.Println("Size of", "hashes:", size.Of(b.hashes)/1024/1024, "MB")
// fmt.Println("Size of", "ids:", size.Of(b.ids)/1024/1024, "MB")
// fmt.Println("Size of", "basicMapStorage:", size.Of(b)/1024/1024, "MB")
}
func (b *basicMapStorage) EncodeHashes() (SavedHashes, error) {
hashes := SavedHashes{}
idmap := map[*[]ID]int{}
for _, ids := range b.ids {
if _, ok := idmap[ids]; ok {
continue
}
hashes.IDs = append(hashes.IDs, *ids)
idmap[ids] = len(hashes.IDs)
}
for hashType, hashToID := range b.hashes {
for hash, ids := range hashToID {
hashes.Hashes[hashType][hash] = idmap[ids]
}
}
return hashes, nil
}
func (b *basicMapStorage) AssociateIDs(newids []NewIDs) {
for _, newid := range newids {
ids, found := b.ids[newid.OldID]
if !found {
msg := "No IDs belonging to " + newid.OldID.Domain + "exist on this server"
panic(msg)
}
*ids = InsertID(*ids, newid.NewID)
}
}
func (b *basicMapStorage) GetIDs(id ID) IDList {
ids, found := b.ids[id]
if !found {
msg := "No IDs belonging to " + id.Domain + "exist on this server"
panic(msg)
}
return ToIDList(*ids)
}
func NewBasicMapStorage() (HashStorage, error) {
storage := &basicMapStorage{
hashMutex: sync.RWMutex{},
hashes: [3]map[uint64]*[]ID{
make(map[uint64]*[]ID),
make(map[uint64]*[]ID),
make(map[uint64]*[]ID),
},
}
return storage, nil
}

View File

@ -29,9 +29,10 @@ import (
"sync"
"time"
"github.com/kr/pretty"
"github.com/vmihailenco/msgpack/v5"
"github.com/disintegration/imaging"
_ "golang.org/x/image/tiff"
_ "golang.org/x/image/vp8"
_ "golang.org/x/image/vp8l"
@ -39,37 +40,20 @@ import (
ch "gitea.narnian.us/lordwelch/comic-hasher"
"gitea.narnian.us/lordwelch/goimagehash"
// "github.com/google/uuid"
// "github.com/zitadel/oidc/pkg/client/rp"
// httphelper "github.com/zitadel/oidc/pkg/http"
// "github.com/zitadel/oidc/pkg/oidc"
)
type Server struct {
httpServer *http.Server
mux *http.ServeMux
BaseURL *url.URL
// token chan<- *oidc.Tokens
// Partial hashes are a uint64 split into 8 pieces (uint8s) for quick lookup; the value is an index to covers
PartialAhash [8]map[uint8][]uint64
PartialDhash [8]map[uint8][]uint64
PartialPhash [8]map[uint8][]uint64
FullAhash map[uint64][]string // Maps ahash's to lists of ID's domain:id
FullDhash map[uint64][]string // Maps dhash's to lists of ID's domain:id
FullPhash map[uint64][]string // Maps phash's to lists of ID's domain:id
ids map[ch.Source]map[string]struct{}
hashMutex sync.RWMutex
httpServer *http.Server
mux *http.ServeMux
BaseURL *url.URL
hashes ch.HashStorage
quit chan struct{}
signalQueue chan os.Signal
readerQueue chan string
hashingQueue chan ch.Im
mappingQueue chan ch.Hash
mappingQueue chan ch.ImageHash
}
// var key = []byte(uuid.New().String())[:16]
type savedHashes map[ch.Source]map[string][3]uint64
type Format int
const (
@ -94,9 +78,6 @@ func (f Format) String() string {
return "Unknown"
}
type Encoder func(any) ([]byte, error)
type Decoder func([]byte, interface{}) error
func (f *Format) Set(s string) error {
if format, known := formatValues[strings.ToLower(s)]; known {
*f = format
@ -106,27 +87,76 @@ func (f *Format) Set(s string) error {
return nil
}
type Storage int
const (
Map = iota + 1
BasicMap
Sqlite
Sqlite3
VPTree
)
var storageNames = map[Storage]string{
Map: "map",
BasicMap: "basicmap",
Sqlite: "sqlite",
Sqlite3: "sqlite3",
VPTree: "vptree",
}
var storageValues = map[string]Storage{
"map": Map,
"basicmap": BasicMap,
"sqlite": Sqlite,
"sqlite3": Sqlite3,
"vptree": VPTree,
}
func (f Storage) String() string {
if name, known := storageNames[f]; known {
return name
}
return "Unknown"
}
func (f *Storage) Set(s string) error {
if storage, known := storageValues[strings.ToLower(s)]; known {
*f = storage
} else {
return fmt.Errorf("Unknown storage type: %d", f)
}
return nil
}
type Encoder func(any) ([]byte, error)
type Decoder func([]byte, interface{}) error
type Opts struct {
cpuprofile string
coverPath string
sqlitePath string
loadEmbeddedHashes bool
saveEmbeddedHashes bool
format Format
hashesPath string
storageType Storage
}
func main() {
opts := Opts{format: Msgpack} // flag is weird
opts := Opts{format: Msgpack, storageType: BasicMap} // flag is weird
go func() {
log.Println(http.ListenAndServe("localhost:6060", nil))
}()
flag.StringVar(&opts.cpuprofile, "cpuprofile", "", "Write cpu profile to file")
flag.StringVar(&opts.coverPath, "cover-path", "", "Path to covers to add to hash database. must be in the form '{cover-path}/{domain}/{id}/*' eg for --cover-path /covers it should look like /covers/comicvine.gamespot.com/10000/image.gif")
flag.StringVar(&opts.sqlitePath, "sqlite-path", "tmp.sqlite", "Path to sqlite database to use for matching hashes, substantially reduces memory usage")
flag.BoolVar(&opts.loadEmbeddedHashes, "use-embedded-hashes", true, "Use hashes embedded in the application as a starting point")
flag.BoolVar(&opts.saveEmbeddedHashes, "save-embedded-hashes", false, "Save hashes even if we loaded the embedded hashes")
flag.StringVar(&opts.hashesPath, "hashes", "hashes.gz", "Path to optionally gziped hashes in msgpack or json format. You must disable embedded hashes to use this option")
flag.Var(&opts.format, "save-format", "Specify the format to export hashes to (json, msgpack)")
flag.Var(&opts.storageType, "storage-type", "Specify the storage type used internally to search hashes (sqlite,sqlite3,map,basicmap,vptree)")
flag.Parse()
if opts.coverPath != "" {
@ -135,6 +165,8 @@ func main() {
panic(err)
}
}
opts.sqlitePath, _ = filepath.Abs(opts.sqlitePath)
log.Println(pretty.Formatter(opts))
startServer(opts)
}
@ -224,25 +256,25 @@ func (s *Server) associateIDs(w http.ResponseWriter, r *http.Request) {
writeJson(w, http.StatusBadRequest, result{Msg: msg})
return
}
if _, domainExists := s.ids[ch.Source(domain)]; !domainExists {
msg := "No IDs belonging to " + domain + "exist on this server"
log.Println(msg)
writeJson(w, http.StatusBadRequest, result{Msg: msg})
}
// if _, domainExists := s.ids[ch.Source(domain)]; !domainExists {
// msg := "No IDs belonging to " + domain + "exist on this server"
// log.Println(msg)
// writeJson(w, http.StatusBadRequest, result{Msg: msg})
// }
log.Printf("Attempting to associate %s:%s to %s:%s", domain, ID, newDomain, newID)
found := false
for _, hash := range []map[uint64][]string{s.FullAhash, s.FullDhash, s.FullPhash} {
for i, idlist := range hash {
if _, found_in_hash := slices.BinarySearch(idlist, domain+":"+ID); found_in_hash {
found = true
hash[i] = ch.Insert(idlist, newDomain+":"+newID)
if _, ok := s.ids[ch.Source(newDomain)]; !ok {
s.ids[ch.Source(newDomain)] = make(map[string]struct{})
}
s.ids[ch.Source(newDomain)][newID] = struct{}{}
}
}
}
// for _, hash := range []map[uint64][]string{s.FullAhash, s.FullDhash, s.FullPhash} {
// for i, idlist := range hash {
// if _, found_in_hash := slices.BinarySearch(idlist, domain+":"+ID); found_in_hash {
// found = true
// hash[i] = ch.Insert(idlist, newDomain+":"+newID)
// if _, ok := s.ids[ch.Source(newDomain)]; !ok {
// s.ids[ch.Source(newDomain)] = make(map[string]struct{})
// }
// s.ids[ch.Source(newDomain)][newID] = struct{}{}
// }
// }
// }
if found {
writeJson(w, http.StatusOK, result{Msg: "New ID added"})
} else {
@ -250,70 +282,6 @@ func (s *Server) associateIDs(w http.ResponseWriter, r *http.Request) {
}
}
func (s *Server) getMatches(ahash, dhash, phash uint64, max int, skipNonExact bool) []ch.Result {
var foundMatches []ch.Result
s.hashMutex.RLock()
defer s.hashMutex.RUnlock()
if skipNonExact { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
if matchedResults, ok := s.FullAhash[ahash]; ok && ahash != 0 {
foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: 0, Hash: ch.ImageHash{Hash: ahash, Kind: goimagehash.AHash}})
}
if matchedResults, ok := s.FullDhash[dhash]; ok && dhash != 0 {
foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: 0, Hash: ch.ImageHash{Hash: dhash, Kind: goimagehash.DHash}})
}
if matchedResults, ok := s.FullPhash[phash]; ok && phash != 0 {
foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: 0, Hash: ch.ImageHash{Hash: phash, Kind: goimagehash.PHash}})
}
// If we have exact matches don't bother with other matches
if len(foundMatches) > 0 && skipNonExact {
return foundMatches
}
}
foundHashes := make(map[uint64]struct{})
if ahash != 0 {
for i, partialHash := range ch.SplitHash(ahash) {
for _, match := range ch.Atleast(max, ahash, s.PartialAhash[i][partialHash]) {
_, alreadyMatched := foundHashes[match.Hash]
if matchedResults, ok := s.FullAhash[match.Hash]; ok && !alreadyMatched {
foundHashes[match.Hash] = struct{}{}
foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: match.Distance, Hash: ch.ImageHash{Hash: match.Hash, Kind: goimagehash.AHash}})
}
}
}
}
foundHashes = make(map[uint64]struct{})
if dhash != 0 {
for i, partialHash := range ch.SplitHash(dhash) {
for _, match := range ch.Atleast(max, dhash, s.PartialDhash[i][partialHash]) {
_, alreadyMatched := foundHashes[match.Hash]
if matchedResults, ok := s.FullDhash[match.Hash]; ok && !alreadyMatched {
foundHashes[match.Hash] = struct{}{}
foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: match.Distance, Hash: ch.ImageHash{Hash: match.Hash, Kind: goimagehash.DHash}})
}
}
}
}
foundHashes = make(map[uint64]struct{})
if phash != 0 {
for i, partialHash := range ch.SplitHash(phash) {
for _, match := range ch.Atleast(max, phash, s.PartialPhash[i][partialHash]) {
_, alreadyMatched := foundHashes[match.Hash]
if matchedResults, ok := s.FullPhash[match.Hash]; ok && !alreadyMatched {
foundHashes[match.Hash] = struct{}{}
foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: match.Distance, Hash: ch.ImageHash{Hash: match.Hash, Kind: goimagehash.PHash}})
}
}
}
}
return foundMatches
}
type SimpleResult struct {
Distance int
IDList ch.IDList
@ -323,67 +291,31 @@ func getSimpleResults(fullResults []ch.Result) []SimpleResult {
simpleResult := make([]SimpleResult, 0, len(fullResults))
slices.SortFunc(fullResults, func(a, b ch.Result) int {
return cmp.Compare(a.Distance, b.Distance)
return cmp.Compare(a.Distance, b.Distance) * -1 // Reverses sort
})
// Deduplicate IDs
idToDistance := make(map[string]int)
distance := make(map[int]SimpleResult)
for _, fullResult := range fullResults {
for _, id := range fullResult.IDs {
if distance, ok := idToDistance[id]; !ok || fullResult.Distance < distance {
idToDistance[id] = fullResult.Distance
simple, ok := distance[fullResult.Distance]
if !ok {
simple.IDList = make(ch.IDList)
}
for source, ids := range fullResult.IDs {
for _, id := range ids {
simple.IDList[source] = ch.Insert(simple.IDList[source], id)
}
}
}
// Group by distance
distanceMap := make(map[int]SimpleResult)
for id, distance := range idToDistance {
var (
sr SimpleResult
ok bool
)
if sr, ok = distanceMap[distance]; !ok {
sr.IDList = make(ch.IDList)
}
sourceID := strings.SplitN(id, ":", 2)
sr.Distance = distance
sr.IDList[ch.Source(sourceID[0])] = append(sr.IDList[ch.Source(sourceID[0])], sourceID[1])
distanceMap[distance] = sr
}
// turn into array
for _, sr := range distanceMap {
for _, sr := range distance {
simpleResult = append(simpleResult, sr)
}
return simpleResult
}
type APIResult struct {
IDList ch.IDList
Distance int
Hash ch.ImageHash
}
func getResults(fullResults []ch.Result) []APIResult {
apiResults := make([]APIResult, 0, len(fullResults))
for _, res := range fullResults {
idlist := make(ch.IDList)
for _, id := range res.IDs {
sourceID := strings.SplitN(id, ":", 2)
idlist[ch.Source(sourceID[0])] = append(idlist[ch.Source(sourceID[0])], sourceID[1])
}
apiResults = append(apiResults,
APIResult{
Distance: res.Distance,
Hash: res.Hash,
IDList: idlist,
},
)
}
return apiResults
}
type result struct {
Results any `json:"results,omitempty"`
Msg string `json:"msg,omitempty"`
@ -411,19 +343,20 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) {
return
}
var (
values = r.URL.Query()
ahashStr = strings.TrimSpace(values.Get("ahash"))
dhashStr = strings.TrimSpace(values.Get("dhash"))
phashStr = strings.TrimSpace(values.Get("phash"))
maxStr = strings.TrimSpace(values.Get("max"))
skipNonExact = strings.ToLower(strings.TrimSpace(values.Get("skipNonExact"))) != "false"
simple = strings.ToLower(strings.TrimSpace(values.Get("simple"))) == "true"
ahash uint64
dhash uint64
phash uint64
max int = 8
max_tmp int
err error
values = r.URL.Query()
ahashStr = strings.TrimSpace(values.Get("ahash"))
dhashStr = strings.TrimSpace(values.Get("dhash"))
phashStr = strings.TrimSpace(values.Get("phash"))
maxStr = strings.TrimSpace(values.Get("max"))
exactOnly = strings.ToLower(strings.TrimSpace(values.Get("exactOnly"))) != "false"
simple = strings.ToLower(strings.TrimSpace(values.Get("simple"))) == "true"
ahash uint64
dhash uint64
phash uint64
max int = 8
max_tmp int
err error
hashes []ch.Hash
)
if ahash, err = strconv.ParseUint(ahashStr, 16, 64); err != nil && ahashStr != "" {
@ -431,16 +364,25 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) {
writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"})
return
}
if ahash > 0 {
hashes = append(hashes, ch.Hash{ahash, goimagehash.AHash})
}
if dhash, err = strconv.ParseUint(dhashStr, 16, 64); err != nil && dhashStr != "" {
log.Printf("could not parse dhash: %s", dhashStr)
writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"})
return
}
if dhash > 0 {
hashes = append(hashes, ch.Hash{dhash, goimagehash.DHash})
}
if phash, err = strconv.ParseUint(phashStr, 16, 64); err != nil && phashStr != "" {
log.Printf("could not parse phash: %s", phashStr)
writeJson(w, http.StatusBadRequest, result{Msg: "hash parse failed"})
return
}
if phash > 0 {
hashes = append(hashes, ch.Hash{phash, goimagehash.PHash})
}
if max_tmp, err = strconv.Atoi(maxStr); err != nil && maxStr != "" {
log.Printf("Invalid Max: %s", maxStr)
writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Invalid Max: %s", maxStr)})
@ -455,13 +397,27 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) {
writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Max must be less than 9: %d", max)})
return
}
matches := s.getMatches(ahash, dhash, phash, max, skipNonExact)
matches, err := s.hashes.GetMatches(hashes, max, exactOnly)
slices.SortFunc(matches, func(a ch.Result, b ch.Result) int {
return cmp.Compare(a.Distance, b.Distance)
})
log.Println(err)
if len(matches) > 0 {
var msg string = ""
if err != nil {
msg = err.Error()
}
if simple {
writeJson(w, http.StatusOK, result{Results: getSimpleResults(matches)})
writeJson(w, http.StatusOK, result{
Results: getSimpleResults(matches),
Msg: msg,
})
return
}
writeJson(w, http.StatusOK, result{Results: getResults(matches)})
writeJson(w, http.StatusOK, result{
Results: matches,
Msg: msg,
})
return
}
@ -503,69 +459,14 @@ func (s *Server) addCover(w http.ResponseWriter, r *http.Request) {
return
default:
}
s.hashingQueue <- ch.Im{Im: i, Format: format, Domain: ch.Source(domain), ID: ID, Path: ""}
s.hashingQueue <- ch.Im{Im: i, Format: format, ID: ch.ID{Domain: ch.Source(domain), ID: ID}, Path: ""}
writeJson(w, http.StatusOK, result{Msg: "Success"})
}
func (s *Server) MapHashes(hash ch.Hash) {
s.hashMutex.Lock()
defer s.hashMutex.Unlock()
s.mapHashes(hash.Ahash.GetHash(), hash.Dhash.GetHash(), hash.Phash.GetHash(), hash.Domain, hash.ID)
}
func (s *Server) mapHashes(ahash, dhash, phash uint64, domain ch.Source, id string) {
if _, ok := s.ids[domain]; !ok {
s.ids[domain] = make(map[string]struct{})
}
s.ids[domain][id] = struct{}{}
if _, ok := s.FullAhash[ahash]; !ok {
s.FullAhash[ahash] = make([]string, 0, 3)
}
s.FullAhash[ahash] = ch.Insert(s.FullAhash[ahash], string(domain)+":"+id)
if _, ok := s.FullDhash[dhash]; !ok {
s.FullDhash[dhash] = make([]string, 0, 3)
}
s.FullDhash[dhash] = ch.Insert(s.FullDhash[dhash], string(domain)+":"+id)
if _, ok := s.FullPhash[phash]; !ok {
s.FullPhash[phash] = make([]string, 0, 3)
}
s.FullPhash[phash] = ch.Insert(s.FullPhash[phash], string(domain)+":"+id)
for i, partialHash := range ch.SplitHash(ahash) {
s.PartialAhash[i][partialHash] = append(s.PartialAhash[i][partialHash], ahash)
}
for i, partialHash := range ch.SplitHash(dhash) {
s.PartialDhash[i][partialHash] = append(s.PartialDhash[i][partialHash], dhash)
}
for i, partialHash := range ch.SplitHash(phash) {
s.PartialPhash[i][partialHash] = append(s.PartialPhash[i][partialHash], phash)
}
}
func (s *Server) initHashes() {
for i := range s.PartialAhash {
s.PartialAhash[i] = make(map[uint8][]uint64)
}
for i := range s.PartialDhash {
s.PartialDhash[i] = make(map[uint8][]uint64)
}
for i := range s.PartialPhash {
s.PartialPhash[i] = make(map[uint8][]uint64)
}
s.FullAhash = make(map[uint64][]string)
s.FullDhash = make(map[uint64][]string)
s.FullPhash = make(map[uint64][]string)
s.ids = make(map[ch.Source]map[string]struct{})
}
func (s *Server) mapper(done func()) {
defer done()
for hash := range s.mappingQueue {
s.MapHashes(hash)
s.hashes.MapHashes(hash)
}
}
@ -575,7 +476,7 @@ func (s *Server) hasher(workerID int, done func()) {
start := time.Now()
hash := ch.HashImage(image)
if hash.Domain == "" {
if hash.ID.Domain == "" || hash.ID.ID == "" {
continue
}
@ -588,7 +489,7 @@ func (s *Server) hasher(workerID int, done func()) {
}
elapsed := time.Since(start)
log.Printf("Hashing took %v: worker: %v. path: %s ahash: %064b id: %s\n", elapsed, workerID, image.Path, hash.Ahash.GetHash(), hash.ID)
log.Printf("Hashing took %v: worker: %v. path: %s %s: %064b id: %s\n", elapsed, workerID, image.Path, hash.Hashes[0].Kind, hash.Hashes[0].Hash, hash.ID)
}
}
@ -605,7 +506,11 @@ func (s *Server) reader(workerID int, done func()) {
}
file.Close()
im := ch.Im{Im: i, Format: format, Domain: ch.Source(filepath.Base(filepath.Dir(filepath.Dir(path)))), ID: filepath.Base(filepath.Dir(path)), Path: path}
im := ch.Im{
Im: i, Format: format,
ID: ch.ID{Domain: ch.Source(filepath.Base(filepath.Dir(filepath.Dir(path)))), ID: filepath.Base(filepath.Dir(path))},
Path: path,
}
select {
case <-s.quit:
log.Println("Recieved quit")
@ -616,94 +521,48 @@ func (s *Server) reader(workerID int, done func()) {
}
}
func (s *Server) encodeHashes(e Encoder) ([]byte, error) {
hashes := make(savedHashes)
for source, ids := range s.ids {
hashes[source] = make(map[string][3]uint64, len(ids))
}
for hash, idlist := range s.FullAhash {
for _, id := range idlist {
sourceID := strings.SplitN(id, ":", 2)
h := hashes[ch.Source(sourceID[0])][sourceID[1]]
h[0] = hash
hashes[ch.Source(sourceID[0])][sourceID[1]] = h
}
}
for hash, idlist := range s.FullDhash {
for _, id := range idlist {
sourceID := strings.SplitN(id, ":", 2)
h := hashes[ch.Source(sourceID[0])][sourceID[1]]
h[1] = hash
hashes[ch.Source(sourceID[0])][sourceID[1]] = h
}
}
for hash, idlist := range s.FullPhash {
for _, id := range idlist {
sourceID := strings.SplitN(id, ":", 2)
h := hashes[ch.Source(sourceID[0])][sourceID[1]]
h[2] = hash
hashes[ch.Source(sourceID[0])][sourceID[1]] = h
}
}
return e(hashes)
}
// EncodeHashes must have a lock to s.hashMutex
func (s *Server) EncodeHashes(format Format) ([]byte, error) {
var encoder Encoder
switch format {
case Msgpack:
return s.encodeHashes(msgpack.Marshal)
encoder = msgpack.Marshal
case JSON:
return s.encodeHashes(json.Marshal)
encoder = json.Marshal
default:
return nil, fmt.Errorf("Unknown format: %v", format)
}
}
func (s *Server) decodeHashes(d Decoder, hashes []byte) error {
loadedHashes := make(savedHashes)
err := d(hashes, &loadedHashes)
hashes, err := s.hashes.EncodeHashes()
if err != nil {
return err
return nil, err
}
for domain, ids := range loadedHashes {
for id := range ids {
if _, ok := s.ids[domain]; ok {
s.ids[domain][id] = struct{}{}
} else {
s.ids[domain] = make(map[string]struct{})
}
}
}
for _, sourceHashes := range loadedHashes {
s.FullAhash = make(map[uint64][]string, len(sourceHashes))
s.FullDhash = make(map[uint64][]string, len(sourceHashes))
s.FullPhash = make(map[uint64][]string, len(sourceHashes))
break
}
for domain, sourceHashes := range loadedHashes {
for id, h := range sourceHashes {
s.mapHashes(h[0], h[1], h[2], domain, id)
}
}
return nil
return encoder(hashes)
}
// DecodeHashes must have a lock to s.hashMutex
func (s *Server) DecodeHashes(format Format, hashes []byte) error {
var decoder Decoder
switch format {
case Msgpack:
return s.decodeHashes(msgpack.Unmarshal, hashes)
decoder = msgpack.Unmarshal
case JSON:
return s.decodeHashes(json.Unmarshal, hashes)
decoder = json.Unmarshal
default:
return fmt.Errorf("Unknown format: %v", format)
}
loadedHashes := ch.SavedHashes{}
err := decoder(hashes, &loadedHashes)
if err != nil || len(loadedHashes.IDs) == 0 {
fmt.Println("Failed to load hashes, checking if they are old hashes", err)
oldHashes := make(ch.OldSavedHashes)
if err = decoder(hashes, &oldHashes); err != nil {
return err
}
loadedHashes = ch.ConvertSavedHashes(oldHashes)
}
return s.hashes.DecodeHashes(loadedHashes)
}
func (s *Server) HashLocalImages(opts Opts) {
@ -718,10 +577,10 @@ func (s *Server) HashLocalImages(opts Opts) {
log.Println("Recieved quit")
}
err := s.httpServer.Shutdown(context.TODO())
fmt.Println("Err:", err)
log.Println("Err:", err)
return
}
fmt.Println("Hashing covers at ", opts.coverPath)
log.Println("Hashing covers at ", opts.coverPath)
start := time.Now()
err := filepath.WalkDir(opts.coverPath, func(path string, d fs.DirEntry, err error) error {
if err != nil {
@ -747,7 +606,7 @@ func (s *Server) HashLocalImages(opts Opts) {
return nil
})
elapsed := time.Since(start)
fmt.Println("Err:", err, "local hashing took", elapsed)
log.Println("Err:", err, "local hashing took", elapsed)
sig := <-s.signalQueue
if !alreadyQuit {
@ -758,6 +617,22 @@ func (s *Server) HashLocalImages(opts Opts) {
}()
}
func initializeStorage(opts Opts) (ch.HashStorage, error) {
switch opts.storageType {
case Map:
return ch.NewMapStorage()
case BasicMap:
return ch.NewBasicMapStorage()
case Sqlite:
return ch.NewSqliteStorage("sqlite", opts.sqlitePath)
case Sqlite3:
return ch.NewSqliteStorage("sqlite3", opts.sqlitePath)
case VPTree:
return ch.NewVPStorage()
}
return nil, errors.New("Unknown storage type provided")
}
func startServer(opts Opts) {
if opts.cpuprofile != "" {
f, err := os.Create(opts.cpuprofile)
@ -769,13 +644,13 @@ func startServer(opts Opts) {
}
mux := http.NewServeMux()
server := Server{
// token: make(chan *oidc.Tokens),
quit: make(chan struct{}),
signalQueue: make(chan os.Signal, 1),
readerQueue: make(chan string, 1120130), // Number gotten from checking queue size
readerQueue: make(chan string, 100),
hashingQueue: make(chan ch.Im),
mappingQueue: make(chan ch.Hash),
mappingQueue: make(chan ch.ImageHash),
mux: mux,
httpServer: &http.Server{
Addr: ":8080",
@ -786,29 +661,33 @@ func startServer(opts Opts) {
},
}
Notify(server.signalQueue)
imaging.SetMaxProcs(1)
fmt.Println("init hashes")
server.initHashes()
// server.setupOauthHandlers()
fmt.Println("init handlers")
var err error
log.Println("init hashes")
server.hashes, err = initializeStorage(opts)
if err != nil {
panic(err)
}
log.Println("init handlers")
server.setupAppHandlers()
fmt.Println("init hashers")
log.Println("init hashers")
rwg := sync.WaitGroup{}
for i := range 10 {
rwg.Add(1)
go server.reader(i, func() { fmt.Println("Reader completed"); rwg.Done() })
go server.reader(i, func() { log.Println("Reader completed"); rwg.Done() })
}
hwg := sync.WaitGroup{}
for i := range 10 {
hwg.Add(1)
go server.hasher(i, func() { fmt.Println("Hasher completed"); hwg.Done() })
go server.hasher(i, func() { log.Println("Hasher completed"); hwg.Done() })
}
fmt.Println("init mapper")
log.Println("init mapper")
mwg := sync.WaitGroup{}
mwg.Add(1)
go server.mapper(func() { fmt.Println("Mapper completed"); mwg.Done() })
go server.mapper(func() { log.Println("Mapper completed"); mwg.Done() })
if opts.loadEmbeddedHashes && len(ch.Hashes) != 0 {
var err error
@ -829,7 +708,7 @@ func startServer(opts Opts) {
if err != nil {
panic(fmt.Sprintf("Failed to decode embedded hashes: %s", err))
}
fmt.Printf("Loaded embedded %s hashes ahashes: %d dhashes: %d phashes: %d\n", format, len(server.FullAhash), len(server.FullDhash), len(server.FullPhash))
fmt.Printf("Loaded embedded %s hashes\n", format)
} else {
if f, err := os.Open(opts.hashesPath); err == nil {
var buf io.Reader = f
@ -854,35 +733,35 @@ func startServer(opts Opts) {
if err != nil {
panic(fmt.Sprintf("Failed to decode hashes from disk: %s", err))
}
fmt.Printf("Loaded hashes from %q %s hashes ahashes: %d dhashes: %d phashes: %d\n", opts.hashesPath, format, len(server.FullAhash), len(server.FullDhash), len(server.FullPhash))
fmt.Printf("Loaded hashes from %q %s\n", opts.hashesPath, format)
} else {
if errors.Is(err, os.ErrNotExist) {
fmt.Println("No saved hashes to load")
log.Println("No saved hashes to load")
} else {
fmt.Println("Unable to load saved hashes", err)
log.Println("Unable to load saved hashes", err)
}
}
}
server.HashLocalImages(opts)
fmt.Println("Listening on ", server.httpServer.Addr)
err := server.httpServer.ListenAndServe()
log.Println("Listening on ", server.httpServer.Addr)
err = server.httpServer.ListenAndServe()
if err != nil {
fmt.Println(err)
log.Println(err)
}
close(server.readerQueue)
fmt.Println("waiting on readers")
log.Println("waiting on readers")
rwg.Wait()
for range server.readerQueue {
}
close(server.hashingQueue)
fmt.Println("waiting on hashers")
log.Println("waiting on hashers")
hwg.Wait()
for range server.hashingQueue {
}
close(server.mappingQueue)
fmt.Println("waiting on mapper")
log.Println("waiting on mapper")
mwg.Wait()
for range server.mappingQueue {
}
@ -897,14 +776,14 @@ func startServer(opts Opts) {
gzw := gzip.NewWriter(f)
_, err := gzw.Write(encodedHashes)
if err != nil {
fmt.Println("Failed to write hashes", err)
log.Println("Failed to write hashes", err)
} else {
fmt.Println("Successfully saved hashes")
log.Println("Successfully saved hashes")
}
gzw.Close()
f.Close()
} else {
fmt.Println("Unabled to save hashes", err)
log.Println("Unabled to save hashes", err)
}
} else {
fmt.Printf("Unable to encode hashes as %v: %v", opts.format, err)

17
cmd/comic-hasher/tmp.go Normal file
View File

@ -0,0 +1,17 @@
//go:build main
package main
import (
"fmt"
"time"
)
func main() {
tmp := make([]string, 0, 932456)
for range 932460 {
tmp = append(tmp, "comicvine.gamespot.com:123456")
}
fmt.Println(len(tmp))
time.Sleep(time.Minute)
}

View File

@ -106,9 +106,9 @@ func main() {
debugImage(debugim, 8, 8)
}
hash := ch.HashImage(ch.Im{Im: im, Format: format, Domain: ch.Source(ch.ComicVine), ID: "nothing"})
hash := ch.HashImage(ch.Im{Im: im, Format: format, ID: ch.ID{Domain: ch.Source(ch.ComicVine), ID: "nothing"}})
fmt.Println("ahash: ", hash.Ahash.BinString())
fmt.Println("dhash: ", hash.Dhash.BinString())
fmt.Println("phash: ", hash.Phash.BinString())
fmt.Println("ahash: ", goimagehash.NewImageHash(hash.Hashes[0].Hash, hash.Hashes[0].Kind).BinString())
fmt.Println("dhash: ", goimagehash.NewImageHash(hash.Hashes[1].Hash, hash.Hashes[1].Kind).BinString())
fmt.Println("phash: ", goimagehash.NewImageHash(hash.Hashes[2].Hash, hash.Hashes[2].Kind).BinString())
}

27
go.mod
View File

@ -1,16 +1,18 @@
module gitea.narnian.us/lordwelch/comic-hasher
go 1.22.1
toolchain go1.22.2
go 1.23.0
require (
gitea.narnian.us/lordwelch/goimagehash v0.0.0-20240812025715-33ff96e45f00
github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09
github.com/fmartingr/go-comicinfo/v2 v2.0.2
github.com/kr/pretty v0.1.0
github.com/mattn/go-sqlite3 v1.14.22
github.com/mholt/archiver/v4 v4.0.0-alpha.8
github.com/ncruces/go-sqlite3 v0.18.1
golang.org/x/image v0.19.0
golang.org/x/text v0.17.0
gonum.org/v1/gonum v0.15.1
modernc.org/sqlite v1.32.0
)
require (
@ -24,19 +26,36 @@ require (
github.com/bodgit/sevenzip v1.3.0 // indirect
github.com/bodgit/windows v1.0.0 // indirect
github.com/connesc/cipherio v0.2.1 // indirect
github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09 // indirect
github.com/dsnet/compress v0.0.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/golang/mock v1.6.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/hashicorp/errwrap v1.0.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/klauspost/compress v1.15.9 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
github.com/kr/text v0.1.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v0.1.9 // indirect
github.com/ncruces/julianday v1.0.0 // indirect
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect
github.com/pierrec/lz4/v4 v4.1.15 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/tetratelabs/wazero v1.8.0 // indirect
github.com/therootcompany/xz v1.0.1 // indirect
github.com/ulikunitz/xz v0.5.10 // indirect
go4.org v0.0.0-20200411211856-f5505b9728dd // indirect
golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect
golang.org/x/sys v0.24.0 // indirect
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect
modernc.org/libc v1.55.3 // indirect
modernc.org/mathutil v1.6.0 // indirect
modernc.org/memory v1.8.0 // indirect
modernc.org/strutil v1.2.0 // indirect
modernc.org/token v1.1.0 // indirect
)
replace golang.org/x/text v0.17.0 => github.com/lordwelch/text v0.0.0-20240505231825-4893f344170f

61
go.sum
View File

@ -42,6 +42,8 @@ github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09/go.mod h1
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fmartingr/go-comicinfo/v2 v2.0.2 h1:VppvrHr8C4+iktBTOd7vzTMNbVecZ7F/Ji1kPTOIGg4=
@ -75,7 +77,11 @@ github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXi
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd h1:gbpYu9NMq8jhDVbvlGkMFWCjLFlqqEZjEmObmhUy6Vo=
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
@ -84,6 +90,8 @@ github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+l
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
@ -94,13 +102,25 @@ github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHU
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE=
github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/lordwelch/text v0.0.0-20240505231825-4893f344170f h1:RMKTfrT4gjJfmB/aWuvCcFxUSvWAJfOAc5khGL6ASjk=
github.com/lordwelch/text v0.0.0-20240505231825-4893f344170f/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/mholt/archiver/v4 v4.0.0-alpha.8 h1:tRGQuDVPh66WCOelqe6LIGh0gwmfwxUrSSDunscGsRM=
github.com/mholt/archiver/v4 v4.0.0-alpha.8/go.mod h1:5f7FUYGXdJWUjESffJaYR4R60VhnHxb2X3T1teMyv5A=
github.com/ncruces/go-sqlite3 v0.18.1 h1:iN8IMZV5EMxpH88NUac9vId23eTKNFUhP7jgY0EBbNc=
github.com/ncruces/go-sqlite3 v0.18.1/go.mod h1:eEOyZnW1dGTJ+zDpMuzfYamEUBtdFz5zeYhqLBtHxvM=
github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M=
github.com/ncruces/julianday v1.0.0/go.mod h1:Dusn2KvZrrovOMJuOt0TNXL6tB7U2E8kvza5fFc9G7g=
github.com/nwaples/rardecode/v2 v2.0.0-beta.2 h1:e3mzJFJs4k83GXBEiTaQ5HgSc/kOK8q0rDaRO0MPaOk=
github.com/nwaples/rardecode/v2 v2.0.0-beta.2/go.mod h1:yntwv/HfMc/Hbvtq9I19D1n58te3h6KsqCf3GxyfBGY=
github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0=
@ -108,6 +128,8 @@ github.com/pierrec/lz4/v4 v4.1.15/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFu
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@ -115,6 +137,8 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/tetratelabs/wazero v1.8.0 h1:iEKu0d4c2Pd+QSRieYbnQC9yiFlMS9D+Jr0LsRmcF4g=
github.com/tetratelabs/wazero v1.8.0/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs=
github.com/therootcompany/xz v1.0.1 h1:CmOtsn1CbtmyYiusbfmhmkpAAETj0wBIH6kCYaX+xzw=
github.com/therootcompany/xz v1.0.1/go.mod h1:3K3UH1yCKgBneZYhuQUvJ9HPD19UEXEI0BWbMn8qNMY=
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
@ -166,6 +190,8 @@ golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@ -193,6 +219,8 @@ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@ -209,6 +237,9 @@ golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@ -242,10 +273,14 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=
gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
@ -291,6 +326,32 @@ honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWh
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
modernc.org/cc/v4 v4.21.4 h1:3Be/Rdo1fpr8GrQ7IVw9OHtplU4gWbb+wNgeoBMmGLQ=
modernc.org/cc/v4 v4.21.4/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ=
modernc.org/ccgo/v4 v4.19.2 h1:lwQZgvboKD0jBwdaeVCTouxhxAyN6iawF3STraAal8Y=
modernc.org/ccgo/v4 v4.19.2/go.mod h1:ysS3mxiMV38XGRTTcgo0DQTeTmAO4oCmJl1nX9VFI3s=
modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE=
modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ=
modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw=
modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU=
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 h1:5D53IMaUuA5InSeMu9eJtlQXS2NxAhyWQvkKEgXZhHI=
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4=
modernc.org/libc v1.55.3 h1:AzcW1mhlPNrRtjS5sS+eW2ISCgSOLLNyFzRh/V3Qj/U=
modernc.org/libc v1.55.3/go.mod h1:qFXepLhz+JjFThQ4kzwzOjA/y/artDeg+pcYnY+Q83w=
modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo=
modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E=
modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU=
modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4=
modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0=
modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc=
modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss=
modernc.org/sqlite v1.32.0 h1:6BM4uGza7bWypsw4fdLRsLxut6bHe4c58VeqjRgST8s=
modernc.org/sqlite v1.32.0/go.mod h1:UqoylwmTb9F+IqXERT8bW9zzOWN8qwAIcLdzeBZs4hA=
modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA=
modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=

View File

@ -50,35 +50,124 @@ type Match struct {
}
type ID struct {
Domain, ID string
}
type Result struct {
IDs []string // domain:id
Distance int
Hash ImageHash
}
type Im struct {
Im image.Image
Format string
Domain Source
ID, Path string
}
type Hash struct {
Ahash *goimagehash.ImageHash
Dhash *goimagehash.ImageHash
Phash *goimagehash.ImageHash
Domain Source
ID string
}
type Result struct {
IDs IDList
Distance int
Hash Hash
}
type Im struct {
Im image.Image
Format string
Path string
ID ID
}
type ImageHash struct {
Hashes []Hash
ID ID
}
type Hash struct {
Hash uint64
Kind goimagehash.Kind
}
// IDList is a map of domain to ID eg IDs["comicvine.gamespot.com"] = []string{"1235"}
// Maps are extremely expensive in Go for small maps; this should only be used to return info to a user. No internal code should use this
type IDList map[Source][]string
type OldSavedHashes map[Source]map[string][3]uint64
type SavedHashes struct {
IDs [][]ID
Hashes [3]map[uint64]int
}
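// Hashes[kind-1] maps a hash value to an index into IDs, so multiple hashes
// (and hash kinds) can point at the same ID list.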
func ToIDList(ids []ID) IDList {
idlist := IDList{}
for _, id := range ids {
idlist[id.Domain] = Insert(idlist[id.Domain], id.ID)
}
return idlist
}
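// InsertID inserts id into ids, keeping the slice sorted by (Domain, ID);
// if the ID is already present the slice is returned unchanged.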
func InsertID(ids []ID, id ID) []ID {
index, itemFound := slices.BinarySearchFunc(ids, id, func(e ID, t ID) int {
return cmp.Or(
cmp.Compare(e.Domain, t.Domain),
cmp.Compare(e.ID, t.ID),
)
})
if itemFound {
return ids
}
return slices.Insert(ids, index, id)
}
func (s *SavedHashes) InsertHash(hash Hash, id ID) {
for i, h := range s.Hashes {
if h == nil {
s.Hashes[i] = make(map[uint64]int)
}
}
hashType := int(hash.Kind) - 1
idx, hashFound := s.Hashes[hashType][hash.Hash]
if !hashFound {
idx = len(s.IDs)
s.IDs = append(s.IDs, make([]ID, 0, 3))
}
s.IDs[idx] = InsertID(s.IDs[idx], id)
s.Hashes[hashType][hash.Hash] = idx
}
func ConvertSavedHashes(oldHashes OldSavedHashes) SavedHashes {
t := SavedHashes{}
idcount := 0
for _, ids := range oldHashes {
idcount += len(ids)
}
t.IDs = make([][]ID, 0, idcount)
t.Hashes[0] = make(map[uint64]int, idcount)
t.Hashes[1] = make(map[uint64]int, idcount)
t.Hashes[2] = make(map[uint64]int, idcount)
for domain, sourceHashes := range oldHashes {
for id, hashes := range sourceHashes {
idx := len(t.IDs)
t.IDs = append(t.IDs, []ID{{domain, id}})
for hashType, hash := range hashes {
t.Hashes[hashType][hash] = idx
}
}
}
fmt.Println("Expected number of IDs", idcount)
idcount = 0
for _, ids := range t.IDs {
idcount += len(ids)
}
fmt.Println("length of hashes", len(t.Hashes[0])+len(t.Hashes[1])+len(t.Hashes[2]))
fmt.Println("Length of ID lists", len(t.IDs))
fmt.Println("Total number of IDs", idcount)
return t
}
type NewIDs struct {
OldID ID
NewID ID
}
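// HashStorage is the runtime hash store made pluggable by this change; the server
// selects an implementation (map, basicmap, sqlite, sqlite3 or vptree) with the
// new -storage-type flag.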
type HashStorage interface {
GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error)
MapHashes(ImageHash)
DecodeHashes(hashes SavedHashes) error
EncodeHashes() (SavedHashes, error)
AssociateIDs(newIDs []NewIDs)
GetIDs(id ID) IDList
}
func Atleast(maxDistance int, searchHash uint64, hashes []uint64) []Match {
matchingHashes := make([]Match, 0, len(hashes)/2) // hope that we don't need all of them
for _, storedHash := range hashes {
@ -98,47 +187,49 @@ func Insert[S ~[]E, E cmp.Ordered](slice S, item E) S {
return slices.Insert(slice, index, item)
}
func InsertIdx[S ~[]E, E cmp.Ordered](slice S, item E) (S, int) {
index, itemFound := slices.BinarySearch(slice, item)
if itemFound {
return slice, index
}
return slices.Insert(slice, index, item), index
}
func MemStats() uint64 {
var m runtime.MemStats
runtime.ReadMemStats(&m)
return m.Alloc
}
func HashImage(i Im) Hash {
func HashImage(i Im) ImageHash {
if i.Format == "webp" {
i.Im = goimagehash.FancyUpscale(i.Im.(*image.YCbCr))
}
var (
err error = nil
ahash *goimagehash.ImageHash
dhash *goimagehash.ImageHash
phash *goimagehash.ImageHash
err error
)
ahash, err = goimagehash.AverageHash(i.Im)
ahash, err := goimagehash.AverageHash(i.Im)
if err != nil {
msg := fmt.Sprintf("Failed to ahash Image: %s", err)
log.Println(msg)
return Hash{}
return ImageHash{}
}
dhash, err = goimagehash.DifferenceHash(i.Im)
dhash, err := goimagehash.DifferenceHash(i.Im)
if err != nil {
msg := fmt.Sprintf("Failed to dhash Image: %s", err)
log.Println(msg)
return Hash{}
return ImageHash{}
}
phash, err = goimagehash.PerceptionHash(i.Im)
phash, err := goimagehash.PerceptionHash(i.Im)
if err != nil {
msg := fmt.Sprintf("Failed to phash Image: %s", err)
log.Println(msg)
return Hash{}
return ImageHash{}
}
return Hash{
Ahash: ahash,
Dhash: dhash,
Phash: phash,
Domain: i.Domain,
return ImageHash{
Hashes: []Hash{{ahash.GetHash(), ahash.GetKind()}, {dhash.GetHash(), dhash.GetKind()}, {phash.GetHash(), phash.GetKind()}},
ID: i.ID,
}
}
@ -155,5 +246,3 @@ func SplitHash(hash uint64) [8]uint8 {
uint8((hash & H0) >> Shift0),
}
}
type IDList map[Source][]string // IDs is a map of domain to ID eg IDs['comicvine.gamespot.com'] = []string{"1235"}

147
map.go Normal file
View File

@ -0,0 +1,147 @@
package ch
import (
"fmt"
"slices"
"sync"
)
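// MapStorage layers partial-hash indexes on top of basicMapStorage: for each hash
// kind and each of the 8 byte positions of a 64-bit hash, a map from that byte's
// value to every full hash containing it, so a search only has to test hashes that
// share at least one byte with the query instead of scanning everything.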
type MapStorage struct {
basicMapStorage
partialHash [3][8]map[uint8][]uint64
}
func (m *MapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var foundMatches []Result
m.hashMutex.RLock()
defer m.hashMutex.RUnlock()
resetTime()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
for _, hash := range hashes {
hashType := int(hash.Kind) - 1
idlist := m.hashes[hashType][hash.Hash]
if idlist != nil && len(*idlist) > 0 {
foundMatches = append(foundMatches, Result{
Distance: 0,
Hash: hash,
IDs: ToIDList(*idlist),
})
}
}
// If we have exact matches don't bother with other matches
if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil
}
logTime("Search Exact")
}
totalPartialHashes := 0
for _, searchHash := range hashes {
foundHashes := make(map[uint64]struct{})
hashType := int(searchHash.Kind) - 1
for i, partialHash := range SplitHash(searchHash.Hash) {
partialHashes := m.partialHash[hashType][i][partialHash]
totalPartialHashes += len(partialHashes)
for _, match := range Atleast(max, searchHash.Hash, partialHashes) {
_, alreadyMatched := foundHashes[match.Hash]
if matchedResults, ok := m.hashes[hashType][match.Hash]; ok && !alreadyMatched {
foundHashes[match.Hash] = struct{}{}
foundMatches = append(foundMatches, Result{IDs: ToIDList(*matchedResults), Distance: match.Distance, Hash: Hash{Hash: match.Hash, Kind: searchHash.Kind}})
}
}
}
}
fmt.Println("Total partial hashes tested:", totalPartialHashes)
logTime("Search Complete")
go m.printSizes()
return foundMatches, nil
}
func (m *MapStorage) MapHashes(hash ImageHash) {
m.basicMapStorage.MapHashes(hash)
for _, hash := range hash.Hashes {
hashType := int(hash.Kind) - 1
for i, partialHash := range SplitHash(hash.Hash) {
m.partialHash[hashType][i][partialHash] = Insert(m.partialHash[hashType][i][partialHash], hash.Hash)
}
}
}
func (m *MapStorage) DecodeHashes(hashes SavedHashes) error {
for hashType, sourceHashes := range hashes.Hashes {
m.hashes[hashType] = make(map[uint64]*[]ID, len(sourceHashes))
for savedHash, idlistLocation := range sourceHashes {
for i, partialHash := range SplitHash(savedHash) {
m.partialHash[hashType][i][partialHash] = append(m.partialHash[hashType][i][partialHash], savedHash)
}
m.hashes[hashType][savedHash] = &hashes.IDs[idlistLocation]
}
}
m.printSizes()
for _, partialHashes := range m.partialHash {
for _, partMap := range partialHashes {
for part, hashes := range partMap {
slices.Sort(hashes)
partMap[part] = slices.Compact(hashes)
}
}
}
m.printSizes()
return nil
}
func (m *MapStorage) printSizes() {
fmt.Println("Length of hashes:", len(m.hashes[0])+len(m.hashes[1])+len(m.hashes[2]))
// fmt.Println("Size of", "hashes:", size.Of(m.hashes)/1024/1024, "MB")
// fmt.Println("Size of", "ids:", size.Of(m.ids)/1024/1024, "MB")
// fmt.Println("Size of", "MapStorage:", size.Of(m)/1024/1024, "MB")
}
func NewMapStorage() (HashStorage, error) {
storage := &MapStorage{
basicMapStorage: basicMapStorage{
hashMutex: sync.RWMutex{},
hashes: [3]map[uint64]*[]ID{
make(map[uint64]*[]ID),
make(map[uint64]*[]ID),
make(map[uint64]*[]ID),
},
},
partialHash: [3][8]map[uint8][]uint64{
{
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
},
{
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
},
{
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
make(map[uint8][]uint64),
},
},
}
return storage, nil
}

465
sqlite.go Normal file
View File

@ -0,0 +1,465 @@
package ch
import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"math/bits"
"strings"
"time"
"gitea.narnian.us/lordwelch/goimagehash"
_ "modernc.org/sqlite"
)
type sqliteStorage struct {
db *sql.DB
}
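// sqliteStorage keeps hashes in a Hashes table and IDs in an IDs table joined
// through id_hash; candidate rows are narrowed in SQL and the final Hamming
// distance check happens in Go.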
type sqliteHash struct {
hashid int
Result
}
func (s *sqliteStorage) findExactHashes(statement *sql.Stmt, items ...interface{}) ([]sqliteHash, error) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
hashes := []sqliteHash{}
rows, err := statement.Query(items...)
if err != nil {
return hashes, err
}
for rows.Next() {
var (
r = sqliteHash{Result: Result{IDs: make(IDList)}}
h int64
)
err = rows.Scan(&r.hashid, &h, &r.Hash.Kind)
if err != nil {
rows.Close()
return hashes, err
}
r.Hash.Hash = uint64(h)
hashes = append(hashes, r)
}
rows.Close()
statement, err = s.db.PrepareContext(context.Background(), `SELECT IDS.domain, IDs.id FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid=?) ORDER BY IDs.domain, IDs.ID;`)
if err != nil {
return hashes, err
}
for _, hash := range hashes {
rows, err := statement.Query(hash.hashid)
if err != nil {
return hashes, err
}
for rows.Next() {
var source Source
var id string
err := rows.Scan(&source, &id)
if err != nil {
return hashes, err
}
hash.IDs[source] = append(hash.IDs[source], id)
}
rows.Close()
}
return hashes, nil
}
func (s *sqliteStorage) findPartialHashes(max int, search_hash int64, kind goimagehash.Kind) ([]sqliteHash, error) { // prefilters candidates in SQL on any shared byte, then applies the Hamming-distance cut-off in Go
hashes := []sqliteHash{}
statement, err := s.db.PrepareContext(context.Background(), `SELECT rowid,hash,kind FROM Hashes WHERE (kind=?) AND (((hash >> (0 * 8) & 0xFF)=(?2 >> (0 * 8) & 0xFF)) OR ((hash >> (1 * 8) & 0xFF)=(?2 >> (1 * 8) & 0xFF)) OR ((hash >> (2 * 8) & 0xFF)=(?2 >> (2 * 8) & 0xFF)) OR ((hash >> (3 * 8) & 0xFF)=(?2 >> (3 * 8) & 0xFF)) OR ((hash >> (4 * 8) & 0xFF)=(?2 >> (4 * 8) & 0xFF)) OR ((hash >> (5 * 8) & 0xFF)=(?2 >> (5 * 8) & 0xFF)) OR ((hash >> (6 * 8) & 0xFF)=(?2 >> (6 * 8) & 0xFF)) OR ((hash >> (7 * 8) & 0xFF)=(?2 >> (7 * 8) & 0xFF)));`)
if err != nil {
return hashes, err
}
rows, err := statement.Query(kind, int64(search_hash))
if err != nil {
return hashes, err
}
for rows.Next() {
var (
r = sqliteHash{Result: Result{IDs: make(IDList)}}
h int64
)
err = rows.Scan(&r.hashid, &h, &r.Hash.Kind)
if err != nil {
rows.Close()
return hashes, err
}
r.Hash.Hash = uint64(h)
r.Distance = bits.OnesCount64(uint64(search_hash) ^ r.Hash.Hash)
if r.Distance <= max {
hashes = append(hashes, r)
}
}
rows.Close()
logTime("Filter partial " + kind.String())
statement, err = s.db.PrepareContext(context.Background(), `SELECT DISTINCT IDS.domain, IDs.id, id_hash.hashid FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid in (`+strings.TrimRight(strings.Repeat("?,", len(hashes)), ",")+`)) ORDER BY IDs.domain, IDs.ID;`)
if err != nil {
return hashes, err
}
var ids []any
for _, hash := range hashes {
ids = append(ids, hash.hashid)
}
rows, err = statement.Query(ids...)
if err != nil {
return hashes, err
}
for rows.Next() {
var source Source
var id string
var hashid int
err := rows.Scan(&source, &id, &hashid)
if err != nil {
return hashes, err
}
for _, hash := range hashes {
if hash.hashid == hashid {
hash.IDs[source] = append(hash.IDs[source], id)
}
}
}
rows.Close()
return hashes, nil
}
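
The long WHERE clause above is the SQL form of the partial-hash trick: by the pigeonhole principle, two 64-bit hashes whose Hamming distance is at most 7 must agree on at least one of their 8 bytes, so prefiltering on "any byte equal" never discards such a candidate (for a max of 8 or more the prefilter can, in principle, drop valid matches). A minimal sketch of the two-stage check, with hypothetical helper names and math/bits assumed imported as it already is in this file:

// Stage 1 (done by the SQL above): keep rows that share at least one byte with the search hash.
func sharesAByte(stored, search uint64) bool {
    for i := 0; i < 8; i++ {
        if uint8(stored>>(i*8)) == uint8(search>>(i*8)) {
            return true
        }
    }
    return false
}

// Stage 2 (done in Go above): keep only rows within the requested Hamming distance.
func withinDistance(stored, search uint64, max int) bool {
    return bits.OnesCount64(stored^search) <= max
}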
func (s *sqliteStorage) dropIndexes() error {
_, err := s.db.Exec(`
DROP INDEX IF EXISTS hash_index;
DROP INDEX IF EXISTS hash_1_index;
DROP INDEX IF EXISTS hash_2_index;
DROP INDEX IF EXISTS hash_3_index;
DROP INDEX IF EXISTS hash_4_index;
DROP INDEX IF EXISTS hash_5_index;
DROP INDEX IF EXISTS hash_6_index;
DROP INDEX IF EXISTS hash_7_index;
DROP INDEX IF EXISTS hash_8_index;
DROP INDEX IF EXISTS id_domain;
`)
if err != nil {
return err
}
return nil
}
func (s *sqliteStorage) createIndexes() error {
_, err := s.db.Exec(`
CREATE INDEX IF NOT EXISTS hash_index ON Hashes (kind, hash);
CREATE INDEX IF NOT EXISTS hash_1_index ON Hashes ((hash >> (0 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_2_index ON Hashes ((hash >> (1 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_3_index ON Hashes ((hash >> (2 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_4_index ON Hashes ((hash >> (3 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_5_index ON Hashes ((hash >> (4 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_6_index ON Hashes ((hash >> (5 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_7_index ON Hashes ((hash >> (6 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS hash_8_index ON Hashes ((hash >> (7 * 8) & 0xFF));
CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, id);
PRAGMA shrink_memory;
ANALYZE;
`)
if err != nil {
return err
}
return nil
}
var (
total time.Duration
t = time.Now()
)
func resetTime() {
total = 0
t = time.Now()
}
func logTime(log string) {
n := time.Now()
s := n.Sub(t)
t = n
total += s
fmt.Printf("total: %v, %s: %v\n", total, log, s)
}
func (s *sqliteStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var (
foundMatches []Result
)
resetTime()
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
args := make([]interface{}, 0, len(hashes)*2)
for _, hash := range hashes {
if hash.Hash != 0 {
args = append(args, int64(hash.Hash), hash.Kind)
}
}
// Build one placeholder pair per hash actually bound so the argument count always matches the query.
statement, err := s.db.Prepare(`SELECT rowid,hash,kind FROM Hashes WHERE ` + strings.TrimSuffix(strings.Repeat("(hash=? AND kind=?) OR ", len(args)/2), " OR ") + ` ORDER BY kind,hash;`)
if err != nil {
logTime("Fail exact")
return foundMatches, err
}
hashes, err := s.findExactHashes(statement, args...)
if err != nil {
return foundMatches, err
}
for _, hash := range hashes {
foundMatches = append(foundMatches, hash.Result)
}
// If we have exact matches don't bother with other matches
if len(foundMatches) > 0 && exactOnly {
return foundMatches, nil
}
logTime("Search Exact")
}
foundHashes := make(map[uint64]struct{})
for _, hash := range hashes {
hashes, err := s.findPartialHashes(max, int64(hash.Hash), hash.Kind)
if err != nil {
return foundMatches, err
}
logTime("Search partial " + hash.Kind.String())
for _, hash := range hashes {
if _, alreadyMatched := foundHashes[hash.Hash.Hash]; !alreadyMatched {
foundHashes[hash.Hash.Hash] = struct{}{}
foundMatches = append(foundMatches, hash.Result)
} else {
log.Println("Hash already found", hash)
}
}
}
return foundMatches, nil
}
func (s *sqliteStorage) MapHashes(hash ImageHash) {
tx, err := s.db.BeginTx(context.Background(), nil)
if err != nil {
panic(err)
}
insertHashes, err := tx.Prepare(`
INSERT INTO Hashes (hash,kind) VALUES (?,?) ON CONFLICT DO UPDATE SET hash=?1 RETURNING hashid
`)
if err != nil {
panic(err)
}
rows, err := tx.Query(`
INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO UPDATE SET domain=?1 RETURNING idid
`, hash.ID.Domain, hash.ID.ID)
if err != nil {
panic(err)
}
if !rows.Next() {
panic("Unable to insert IDs")
}
var id_id int64
err = rows.Scan(&id_id)
if err != nil {
panic(err)
}
rows.Close()
hash_ids := []int64{}
for _, hash := range hash.Hashes {
rows, err := insertHashes.Query(int64(hash.Hash), hash.Kind)
if err != nil {
panic(err)
}
if !rows.Next() {
panic("Unable to insert IDs")
}
var id int64
err = rows.Scan(&id)
rows.Close()
if err != nil {
panic(err)
}
hash_ids = append(hash_ids, id)
}
var ids []any
for _, hash_id := range hash_ids {
ids = append(ids, hash_id, id_id)
}
_, err = tx.Exec(`INSERT INTO id_hash (hashid,idid) VALUES `+strings.TrimSuffix(strings.Repeat("(?, ?),", len(hash_ids)), ",")+` ON CONFLICT DO NOTHING;`, ids...)
if err != nil {
panic(fmt.Errorf("Failed inserting: %v,%v: %w", hash.ID.Domain, hash.ID.ID, err))
}
err = tx.Commit()
if err != nil {
panic(err)
}
insertHashes.Close()
}
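
MapHashes leans on a SQLite detail worth spelling out: with ON CONFLICT DO NOTHING, RETURNING produces no row when the hash already exists, so the rows.Next() checks above would fail; the no-op ON CONFLICT DO UPDATE SET hash=?1 presumably exists so the existing row is touched and its hashid always comes back. The statement, repeated here only to highlight the idiom:

// Same statement as above; the no-op update exists solely so that RETURNING
// yields the hashid whether the row was just inserted or already present.
const upsertHashSQL = `
INSERT INTO Hashes (hash,kind) VALUES (?,?)
ON CONFLICT DO UPDATE SET hash=?1
RETURNING hashid`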
func (s *sqliteStorage) DecodeHashes(hashes SavedHashes) error {
err := s.dropIndexes()
if err != nil {
return err
}
for hashType, sourceHashes := range hashes.Hashes {
hashKind := goimagehash.Kind(hashType + 1)
for hash, idsLocations := range sourceHashes {
for _, id := range hashes.IDs[idsLocations] {
s.MapHashes(ImageHash{
Hashes: []Hash{{hash, hashKind}},
ID: id,
})
}
}
}
err = s.createIndexes()
if err != nil {
return err
}
return nil
}
func (s *sqliteStorage) EncodeHashes() (SavedHashes, error) {
hashes := SavedHashes{}
conn, err := s.db.Conn(context.Background())
if err != nil {
return hashes, err
}
defer conn.Close()
rows, err := conn.QueryContext(context.Background(), "SELECT IDs.domain,IDs.id,Hashes.hash,Hashes.kind FROM Hashes JOIN id_hash ON id_hash.hashid = hashes.rowid JOIN IDs ON IDs.rowid = id_hash.idid ORDER BY IDs.ID,Hashes.kind,Hashes.hash;")
if err != nil {
// rows is nil when QueryContext fails, so there is nothing to close here
return hashes, err
}
var (
id   ID
hash Hash
h    int64
)
// Iterate every joined row; Scan without a prior Next would fail immediately.
// The hash is scanned via int64 because SQLite stores it signed.
for rows.Next() {
err = rows.Scan(&id.Domain, &id.ID, &h, &hash.Kind)
if err != nil {
rows.Close()
return hashes, err
}
hash.Hash = uint64(h)
hashes.InsertHash(hash, id)
}
rows.Close()
return hashes, nil
}
func (s *sqliteStorage) AssociateIDs(newIDs []NewIDs) {
for _, ids := range newIDs {
var oldIDID, newIDID int
_, err := s.db.Exec(`INSERT INTO IDs (domain,id) VALUES (?,?) ON CONFLICT DO NOTHING`, ids.NewID.Domain, ids.NewID.ID) // tolerate an already-known ID
if err != nil {
panic(err)
}
rows, err := s.db.Query(`SELECT idid FROM IDs WHERE domain=? AND id=?`, ids.NewID.Domain, ids.NewID.ID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
panic(err)
}
if rows.Next() {
rows.Scan(&newIDID)
} else {
panic("Unable to insert New ID into database")
}
rows.Close()
rows, err = s.db.Query(`SELECT idid FROM IDs WHERE domain=? AND id=?`, ids.OldID.Domain, ids.OldID.ID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
panic(err)
}
if rows.Next() {
rows.Scan(&oldIDID)
} else {
rows.Close()
continue
}
rows.Close()
// id_hash columns are (hashid, idid); copy the old ID's associations to the new ID
_, err = s.db.Exec(`INSERT INTO id_hash (hashid, idid) SELECT hashid,? FROM id_hash WHERE idid=?`, newIDID, oldIDID)
if err != nil {
panic(err)
}
}
}
func (s *sqliteStorage) GetIDs(id ID) IDList {
var idid int
rows, err := s.db.Query(`SELECT idid FROM IDs WHERE domain=? AND id=?`, id.Domain, id.ID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
panic(err)
}
if rows.Next() {
rows.Scan(&idid)
} else {
rows.Close()
return nil
}
rows.Close()
// id_hash columns are (hashid, idid); collect the hash ids linked to this entry
rows, err = s.db.Query(`SELECT hashid FROM id_hash WHERE idid=?`, idid)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
panic(err)
}
var hashIDs []interface{}
for rows.Next() {
var hashID int
rows.Scan(&hashID)
hashIDs = append(hashIDs, hashID)
}
rows.Close()
IDs := make(IDList)
rows, err = s.db.Query(`SELECT IDs.domain,IDs.id FROM id_hash JOIN IDs ON id_hash.idid=IDs.idid WHERE hashid in (`+strings.TrimRight(strings.Repeat("?,", len(hashIDs)), ",")+`)`, hashIDs...)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
panic(err)
}
for rows.Next() {
var id ID
rows.Scan(&id.Domain, &id.ID)
IDs[id.Domain] = append(IDs[id.Domain], id.ID)
}
rows.Close()
return IDs
}
func NewSqliteStorage(db, path string) (HashStorage, error) {
sqlite := &sqliteStorage{}
sqlDB, err := sql.Open(db, fmt.Sprintf("file://%s?_pragma=cache_size(-200000)&_pragma=busy_timeout(500)&_pragma=hard_heap_limit(1073741824)&_pragma=journal_mode(wal)&_pragma=soft_heap_limit(314572800)", path))
if err != nil {
return nil, err
}
sqlite.db = sqlDB
_, err = sqlite.db.Exec(`
PRAGMA foreign_keys=ON;
CREATE TABLE IF NOT EXISTS Hashes(
hashid INTEGER PRIMARY KEY,
hash INT NOT NULL,
kind int NOT NULL,
UNIQUE(kind, hash)
);
CREATE TABLE IF NOT EXISTS IDs(
id TEXT NOT NULL,
domain TEXT NOT NULL,
idid INTEGER PRIMARY KEY,
UNIQUE (domain, id)
);
CREATE INDEX IF NOT EXISTS id_domain ON IDs (domain, id);
CREATE TABLE IF NOT EXISTS id_hash(
hashid INTEGER,
idid INTEGER,
FOREIGN KEY(hashid) REFERENCES Hashes(hashid),
FOREIGN KEY(idid) REFERENCES IDs(idid)
UNIQUE (hashid, idid)
);
`)
if err != nil {
return nil, err
}
err = sqlite.createIndexes()
if err != nil {
return nil, err
}
sqlite.db.SetMaxOpenConns(1)
return sqlite, nil
}
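
A hedged construction sketch; the driver name passed as db has to match whichever SQLite driver the build links (the pure-Go modernc.org/sqlite driver imported above is believed to register "sqlite", while the cgo mattn driver and the ncruces driver in the build-tagged files below are expected to register "sqlite3"), and the database path is made up:

package ch

// exampleNewSqliteStorage is a sketch, not part of this change.
func exampleNewSqliteStorage() (HashStorage, error) {
    // Driver name and path are assumptions; adjust them to the build in use.
    return NewSqliteStorage("sqlite", "/tmp/comic-hashes.sqlite")
}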

7
sqlite_cgo.go Normal file
View File

@ -0,0 +1,7 @@
//go:build cgo
package ch
import (
_ "github.com/mattn/go-sqlite3"
)

8
sqlite_no_cgo.go Normal file
View File

@ -0,0 +1,8 @@
//go:build !cgo
package ch
import (
_ "github.com/ncruces/go-sqlite3/driver"
_ "github.com/ncruces/go-sqlite3/embed"
)

105
vp-tree.go Normal file
View File

@ -0,0 +1,105 @@
package ch
import (
"errors"
"fmt"
"math/bits"
"gitea.narnian.us/lordwelch/goimagehash"
"gonum.org/v1/gonum/spatial/vptree"
)
type VPTree struct {
trees [3]*vptree.Tree
hashes [3][]vptree.Comparable
}
type VPHash struct {
Hash Hash
IDs []ID
}
func (h *VPHash) Distance(c vptree.Comparable) float64 {
h2, ok := c.(*VPHash)
if !ok {
return -99
}
return float64(bits.OnesCount64(h.Hash.Hash ^ h2.Hash.Hash))
}
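
vp-trees prune their search on the assumption that Distance is a true metric; the XOR-popcount Hamming distance used here is symmetric and satisfies the triangle inequality, so the pruning is sound. A throwaway, purely illustrative check:

// Illustrative only: the Hamming distance on uint64 behaves as a metric.
func hammingDist(a, b uint64) int { return bits.OnesCount64(a ^ b) }

// triangleHolds is expected to return true for any a, b, c.
func triangleHolds(a, b, c uint64) bool {
    return hammingDist(a, c) <= hammingDist(a, b)+hammingDist(b, c)
}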
func (v *VPTree) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) {
var matches []Result
var exactMatches []Result
fmt.Println(hashes)
for _, hash := range hashes {
results := vptree.NewDistKeeper(float64(max))
hashType := int(hash.Kind) - 1
v.trees[hashType].NearestSet(results, &VPHash{Hash: hash})
for _, result := range results.Heap {
vphash := result.Comparable.(*VPHash)
if result.Dist == 0 {
exactMatches = append(exactMatches, Result{
IDs: ToIDList(vphash.IDs),
Distance: int(result.Dist),
Hash: vphash.Hash,
})
} else {
matches = append(matches, Result{
IDs: ToIDList(vphash.IDs),
Distance: int(result.Dist),
Hash: vphash.Hash,
})
}
}
}
if len(exactMatches) > 0 && exactOnly {
return exactMatches, nil
}
matches = append(exactMatches[:len(exactMatches):len(exactMatches)], matches...)
return matches, nil
}
func (v *VPTree) MapHashes(ImageHash) {
panic("Not Implemented")
}
func (v *VPTree) DecodeHashes(hashes SavedHashes) error {
var err error
for hashType, sourceHashes := range hashes.Hashes {
for hash, idsLocation := range sourceHashes {
var (
hashKind = goimagehash.Kind(hashType + 1)
)
hash := &VPHash{Hash{hash, hashKind}, hashes.IDs[idsLocation]}
v.hashes[hashType] = append(v.hashes[hashType], hash)
}
}
for hashType := range 3 {
v.trees[hashType], err = vptree.New(v.hashes[hashType], 3, nil)
if err != nil {
return err
}
}
return nil
}
func (v *VPTree) EncodeHashes() (SavedHashes, error) {
return SavedHashes{}, errors.New("Not Implemented")
}
func (v *VPTree) AssociateIDs(newIDs []NewIDs) {
panic("Not Implemented")
}
func (v *VPTree) GetIDs(id ID) IDList {
return nil
}
func NewVPStorage() (HashStorage, error) {
return &VPTree{
hashes: [3][]vptree.Comparable{
make([]vptree.Comparable, 0, 1_000_000),
make([]vptree.Comparable, 0, 1_000_000),
make([]vptree.Comparable, 0, 1_000_000),
},
}, nil
}
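
Because MapHashes and AssociateIDs panic, the VP-tree backend can only be populated through DecodeHashes; a hedged sketch of the intended flow, where the SavedHashes value is assumed to come from another storage's EncodeHashes or from disk:

package ch

import "gitea.narnian.us/lordwelch/goimagehash"

// exampleVPTreeUsage is a sketch, not part of this change.
func exampleVPTreeUsage(saved SavedHashes) ([]Result, error) {
    storage, err := NewVPStorage()
    if err != nil {
        return nil, err
    }
    if err := storage.DecodeHashes(saved); err != nil {
        return nil, err
    }
    // Kind(1) follows the hashType+1 convention used in this change; the hash value is made up.
    return storage.GetMatches([]Hash{{Hash: 0xDEADBEEFCAFEF00D, Kind: goimagehash.Kind(1)}}, 8, false)
}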