diff --git a/cmd/comic-hasher/main.go b/cmd/comic-hasher/main.go index 4e55ec4..2a3ffe7 100644 --- a/cmd/comic-hasher/main.go +++ b/cmd/comic-hasher/main.go @@ -29,9 +29,10 @@ import ( "sync" "time" + "github.com/kr/pretty" + "github.com/vmihailenco/msgpack/v5" - "github.com/disintegration/imaging" _ "golang.org/x/image/tiff" _ "golang.org/x/image/vp8" _ "golang.org/x/image/vp8l" @@ -39,37 +40,20 @@ import ( ch "gitea.narnian.us/lordwelch/comic-hasher" "gitea.narnian.us/lordwelch/goimagehash" - // "github.com/google/uuid" - // "github.com/zitadel/oidc/pkg/client/rp" - // httphelper "github.com/zitadel/oidc/pkg/http" - // "github.com/zitadel/oidc/pkg/oidc" ) type Server struct { - httpServer *http.Server - mux *http.ServeMux - BaseURL *url.URL - // token chan<- *oidc.Tokens - // Partial hashes are a uint64 split into 8 pieces or a unint64 for quick lookup, the value is an index to covers - PartialAhash [8]map[uint8][]uint64 - PartialDhash [8]map[uint8][]uint64 - PartialPhash [8]map[uint8][]uint64 - FullAhash map[uint64][]string // Maps ahash's to lists of ID's domain:id - FullDhash map[uint64][]string // Maps dhash's to lists of ID's domain:id - FullPhash map[uint64][]string // Maps phash's to lists of ID's domain:id - ids map[ch.Source]map[string]struct{} - hashMutex sync.RWMutex + httpServer *http.Server + mux *http.ServeMux + BaseURL *url.URL + hashes ch.HashStorage quit chan struct{} signalQueue chan os.Signal readerQueue chan string hashingQueue chan ch.Im - mappingQueue chan ch.Hash + mappingQueue chan ch.ImageHash } -// var key = []byte(uuid.New().String())[:16] - -type savedHashes map[ch.Source]map[string][3]uint64 - type Format int const ( @@ -135,6 +119,8 @@ func main() { panic(err) } } + opts.sqlitePath, _ = filepath.Abs(opts.sqlitePath) + pretty.Logln(opts) startServer(opts) } @@ -224,25 +210,25 @@ func (s *Server) associateIDs(w http.ResponseWriter, r *http.Request) { writeJson(w, http.StatusBadRequest, result{Msg: msg}) return } - if _, domainExists := s.ids[ch.Source(domain)]; !domainExists { - msg := "No IDs belonging to " + domain + "exist on this server" - log.Println(msg) - writeJson(w, http.StatusBadRequest, result{Msg: msg}) - } + // if _, domainExists := s.ids[ch.Source(domain)]; !domainExists { + // msg := "No IDs belonging to " + domain + "exist on this server" + // log.Println(msg) + // writeJson(w, http.StatusBadRequest, result{Msg: msg}) + // } log.Printf("Attempting to associate %s:%s to %s:%s", domain, ID, newDomain, newID) found := false - for _, hash := range []map[uint64][]string{s.FullAhash, s.FullDhash, s.FullPhash} { - for i, idlist := range hash { - if _, found_in_hash := slices.BinarySearch(idlist, domain+":"+ID); found_in_hash { - found = true - hash[i] = ch.Insert(idlist, newDomain+":"+newID) - if _, ok := s.ids[ch.Source(newDomain)]; !ok { - s.ids[ch.Source(newDomain)] = make(map[string]struct{}) - } - s.ids[ch.Source(newDomain)][newID] = struct{}{} - } - } - } + // for _, hash := range []map[uint64][]string{s.FullAhash, s.FullDhash, s.FullPhash} { + // for i, idlist := range hash { + // if _, found_in_hash := slices.BinarySearch(idlist, domain+":"+ID); found_in_hash { + // found = true + // hash[i] = ch.Insert(idlist, newDomain+":"+newID) + // if _, ok := s.ids[ch.Source(newDomain)]; !ok { + // s.ids[ch.Source(newDomain)] = make(map[string]struct{}) + // } + // s.ids[ch.Source(newDomain)][newID] = struct{}{} + // } + // } + // } if found { writeJson(w, http.StatusOK, result{Msg: "New ID added"}) } else { @@ -250,70 +236,6 @@ func (s *Server) associateIDs(w http.ResponseWriter, r *http.Request) { } } -func (s *Server) getMatches(ahash, dhash, phash uint64, max int, skipNonExact bool) []ch.Result { - var foundMatches []ch.Result - s.hashMutex.RLock() - defer s.hashMutex.RUnlock() - - if skipNonExact { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate - if matchedResults, ok := s.FullAhash[ahash]; ok && ahash != 0 { - foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: 0, Hash: ch.ImageHash{Hash: ahash, Kind: goimagehash.AHash}}) - } - if matchedResults, ok := s.FullDhash[dhash]; ok && dhash != 0 { - foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: 0, Hash: ch.ImageHash{Hash: dhash, Kind: goimagehash.DHash}}) - } - if matchedResults, ok := s.FullPhash[phash]; ok && phash != 0 { - foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: 0, Hash: ch.ImageHash{Hash: phash, Kind: goimagehash.PHash}}) - } - - // If we have exact matches don't bother with other matches - if len(foundMatches) > 0 && skipNonExact { - return foundMatches - } - } - - foundHashes := make(map[uint64]struct{}) - if ahash != 0 { - for i, partialHash := range ch.SplitHash(ahash) { - for _, match := range ch.Atleast(max, ahash, s.PartialAhash[i][partialHash]) { - _, alreadyMatched := foundHashes[match.Hash] - if matchedResults, ok := s.FullAhash[match.Hash]; ok && !alreadyMatched { - foundHashes[match.Hash] = struct{}{} - foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: match.Distance, Hash: ch.ImageHash{Hash: match.Hash, Kind: goimagehash.AHash}}) - } - } - } - } - - foundHashes = make(map[uint64]struct{}) - if dhash != 0 { - for i, partialHash := range ch.SplitHash(dhash) { - for _, match := range ch.Atleast(max, dhash, s.PartialDhash[i][partialHash]) { - _, alreadyMatched := foundHashes[match.Hash] - if matchedResults, ok := s.FullDhash[match.Hash]; ok && !alreadyMatched { - foundHashes[match.Hash] = struct{}{} - foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: match.Distance, Hash: ch.ImageHash{Hash: match.Hash, Kind: goimagehash.DHash}}) - } - } - } - } - - foundHashes = make(map[uint64]struct{}) - if phash != 0 { - for i, partialHash := range ch.SplitHash(phash) { - for _, match := range ch.Atleast(max, phash, s.PartialPhash[i][partialHash]) { - _, alreadyMatched := foundHashes[match.Hash] - if matchedResults, ok := s.FullPhash[match.Hash]; ok && !alreadyMatched { - foundHashes[match.Hash] = struct{}{} - foundMatches = append(foundMatches, ch.Result{IDs: matchedResults, Distance: match.Distance, Hash: ch.ImageHash{Hash: match.Hash, Kind: goimagehash.PHash}}) - } - } - } - } - - return foundMatches -} - type SimpleResult struct { Distance int IDList ch.IDList @@ -323,67 +245,31 @@ func getSimpleResults(fullResults []ch.Result) []SimpleResult { simpleResult := make([]SimpleResult, 0, len(fullResults)) slices.SortFunc(fullResults, func(a, b ch.Result) int { - return cmp.Compare(a.Distance, b.Distance) + return cmp.Compare(a.Distance, b.Distance) * -1 // Reverses sort }) // Deduplicate IDs - idToDistance := make(map[string]int) + distance := make(map[int]SimpleResult) + for _, fullResult := range fullResults { - for _, id := range fullResult.IDs { - if distance, ok := idToDistance[id]; !ok || fullResult.Distance < distance { - idToDistance[id] = fullResult.Distance + simple, ok := distance[fullResult.Distance] + if !ok { + simple.IDList = make(ch.IDList) + } + for source, ids := range fullResult.IDs { + for _, id := range ids { + simple.IDList[source] = ch.Insert(simple.IDList[source], id) } } } - // Group by distance - distanceMap := make(map[int]SimpleResult) - for id, distance := range idToDistance { - var ( - sr SimpleResult - ok bool - ) - if sr, ok = distanceMap[distance]; !ok { - sr.IDList = make(ch.IDList) - } - sourceID := strings.SplitN(id, ":", 2) - sr.Distance = distance - sr.IDList[ch.Source(sourceID[0])] = append(sr.IDList[ch.Source(sourceID[0])], sourceID[1]) - distanceMap[distance] = sr - } - // turn into array - for _, sr := range distanceMap { + for _, sr := range distance { simpleResult = append(simpleResult, sr) } return simpleResult } -type APIResult struct { - IDList ch.IDList - Distance int - Hash ch.ImageHash -} - -func getResults(fullResults []ch.Result) []APIResult { - apiResults := make([]APIResult, 0, len(fullResults)) - for _, res := range fullResults { - idlist := make(ch.IDList) - for _, id := range res.IDs { - sourceID := strings.SplitN(id, ":", 2) - idlist[ch.Source(sourceID[0])] = append(idlist[ch.Source(sourceID[0])], sourceID[1]) - } - apiResults = append(apiResults, - APIResult{ - Distance: res.Distance, - Hash: res.Hash, - IDList: idlist, - }, - ) - } - return apiResults -} - type result struct { Results any `json:"results,omitempty"` Msg string `json:"msg,omitempty"` @@ -411,19 +297,19 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) { return } var ( - values = r.URL.Query() - ahashStr = strings.TrimSpace(values.Get("ahash")) - dhashStr = strings.TrimSpace(values.Get("dhash")) - phashStr = strings.TrimSpace(values.Get("phash")) - maxStr = strings.TrimSpace(values.Get("max")) - skipNonExact = strings.ToLower(strings.TrimSpace(values.Get("skipNonExact"))) != "false" - simple = strings.ToLower(strings.TrimSpace(values.Get("simple"))) == "true" - ahash uint64 - dhash uint64 - phash uint64 - max int = 8 - max_tmp int - err error + values = r.URL.Query() + ahashStr = strings.TrimSpace(values.Get("ahash")) + dhashStr = strings.TrimSpace(values.Get("dhash")) + phashStr = strings.TrimSpace(values.Get("phash")) + maxStr = strings.TrimSpace(values.Get("max")) + exactOnly = strings.ToLower(strings.TrimSpace(values.Get("exactOnly"))) != "false" + simple = strings.ToLower(strings.TrimSpace(values.Get("simple"))) == "true" + ahash uint64 + dhash uint64 + phash uint64 + max int = 8 + max_tmp int + err error ) if ahash, err = strconv.ParseUint(ahashStr, 16, 64); err != nil && ahashStr != "" { @@ -455,13 +341,24 @@ func (s *Server) matchCoverHash(w http.ResponseWriter, r *http.Request) { writeJson(w, http.StatusBadRequest, result{Msg: fmt.Sprintf("Max must be less than 9: %d", max)}) return } - matches := s.getMatches(ahash, dhash, phash, max, skipNonExact) + matches, err := s.hashes.GetMatches([]ch.Hash{{ahash, goimagehash.AHash}, {dhash, goimagehash.DHash}, {phash, goimagehash.PHash}}, max, exactOnly) + log.Println(err) if len(matches) > 0 { + var msg string = "" + if err != nil { + msg = err.Error() + } if simple { - writeJson(w, http.StatusOK, result{Results: getSimpleResults(matches)}) + writeJson(w, http.StatusOK, result{ + Results: getSimpleResults(matches), + Msg: msg, + }) return } - writeJson(w, http.StatusOK, result{Results: getResults(matches)}) + writeJson(w, http.StatusOK, result{ + Results: matches, + Msg: msg, + }) return } @@ -503,69 +400,14 @@ func (s *Server) addCover(w http.ResponseWriter, r *http.Request) { return default: } - s.hashingQueue <- ch.Im{Im: i, Format: format, Domain: ch.Source(domain), ID: ID, Path: ""} + s.hashingQueue <- ch.Im{Im: i, Format: format, ID: ch.ID{Domain: ch.Source(domain), ID: ID}, Path: ""} writeJson(w, http.StatusOK, result{Msg: "Success"}) } -func (s *Server) MapHashes(hash ch.Hash) { - s.hashMutex.Lock() - defer s.hashMutex.Unlock() - s.mapHashes(hash.Ahash.GetHash(), hash.Dhash.GetHash(), hash.Phash.GetHash(), hash.Domain, hash.ID) -} - -func (s *Server) mapHashes(ahash, dhash, phash uint64, domain ch.Source, id string) { - - if _, ok := s.ids[domain]; !ok { - s.ids[domain] = make(map[string]struct{}) - } - s.ids[domain][id] = struct{}{} - - if _, ok := s.FullAhash[ahash]; !ok { - s.FullAhash[ahash] = make([]string, 0, 3) - } - s.FullAhash[ahash] = ch.Insert(s.FullAhash[ahash], string(domain)+":"+id) - - if _, ok := s.FullDhash[dhash]; !ok { - s.FullDhash[dhash] = make([]string, 0, 3) - } - s.FullDhash[dhash] = ch.Insert(s.FullDhash[dhash], string(domain)+":"+id) - - if _, ok := s.FullPhash[phash]; !ok { - s.FullPhash[phash] = make([]string, 0, 3) - } - s.FullPhash[phash] = ch.Insert(s.FullPhash[phash], string(domain)+":"+id) - - for i, partialHash := range ch.SplitHash(ahash) { - s.PartialAhash[i][partialHash] = append(s.PartialAhash[i][partialHash], ahash) - } - for i, partialHash := range ch.SplitHash(dhash) { - s.PartialDhash[i][partialHash] = append(s.PartialDhash[i][partialHash], dhash) - } - for i, partialHash := range ch.SplitHash(phash) { - s.PartialPhash[i][partialHash] = append(s.PartialPhash[i][partialHash], phash) - } -} - -func (s *Server) initHashes() { - for i := range s.PartialAhash { - s.PartialAhash[i] = make(map[uint8][]uint64) - } - for i := range s.PartialDhash { - s.PartialDhash[i] = make(map[uint8][]uint64) - } - for i := range s.PartialPhash { - s.PartialPhash[i] = make(map[uint8][]uint64) - } - s.FullAhash = make(map[uint64][]string) - s.FullDhash = make(map[uint64][]string) - s.FullPhash = make(map[uint64][]string) - s.ids = make(map[ch.Source]map[string]struct{}) -} - func (s *Server) mapper(done func()) { defer done() for hash := range s.mappingQueue { - s.MapHashes(hash) + s.hashes.MapHashes(hash) } } @@ -575,7 +417,7 @@ func (s *Server) hasher(workerID int, done func()) { start := time.Now() hash := ch.HashImage(image) - if hash.Domain == "" { + if hash.ID.Domain == "" || hash.ID.ID == "" { continue } @@ -588,7 +430,7 @@ func (s *Server) hasher(workerID int, done func()) { } elapsed := time.Since(start) - log.Printf("Hashing took %v: worker: %v. path: %s ahash: %064b id: %s\n", elapsed, workerID, image.Path, hash.Ahash.GetHash(), hash.ID) + log.Printf("Hashing took %v: worker: %v. path: %s %s: %064b id: %s\n", elapsed, workerID, image.Path, hash.Hashes[0].Kind, hash.Hashes[0].Hash, hash.ID) } } @@ -605,7 +447,11 @@ func (s *Server) reader(workerID int, done func()) { } file.Close() - im := ch.Im{Im: i, Format: format, Domain: ch.Source(filepath.Base(filepath.Dir(filepath.Dir(path)))), ID: filepath.Base(filepath.Dir(path)), Path: path} + im := ch.Im{ + Im: i, Format: format, + ID: ch.ID{Domain: ch.Source(filepath.Base(filepath.Dir(filepath.Dir(path)))), ID: filepath.Base(filepath.Dir(path))}, + Path: path, + } select { case <-s.quit: log.Println("Recieved quit") @@ -616,94 +462,43 @@ func (s *Server) reader(workerID int, done func()) { } } -func (s *Server) encodeHashes(e Encoder) ([]byte, error) { - hashes := make(savedHashes) - for source, ids := range s.ids { - hashes[source] = make(map[string][3]uint64, len(ids)) - } - for hash, idlist := range s.FullAhash { - for _, id := range idlist { - sourceID := strings.SplitN(id, ":", 2) - h := hashes[ch.Source(sourceID[0])][sourceID[1]] - h[0] = hash - hashes[ch.Source(sourceID[0])][sourceID[1]] = h - } - } - for hash, idlist := range s.FullDhash { - for _, id := range idlist { - sourceID := strings.SplitN(id, ":", 2) - h := hashes[ch.Source(sourceID[0])][sourceID[1]] - h[1] = hash - hashes[ch.Source(sourceID[0])][sourceID[1]] = h - } - - } - for hash, idlist := range s.FullPhash { - for _, id := range idlist { - sourceID := strings.SplitN(id, ":", 2) - h := hashes[ch.Source(sourceID[0])][sourceID[1]] - h[2] = hash - hashes[ch.Source(sourceID[0])][sourceID[1]] = h - } - - } - return e(hashes) -} - // EncodeHashes must have a lock to s.hashMutex func (s *Server) EncodeHashes(format Format) ([]byte, error) { + var encoder Encoder switch format { case Msgpack: - return s.encodeHashes(msgpack.Marshal) + encoder = msgpack.Marshal case JSON: - return s.encodeHashes(json.Marshal) - + encoder = json.Marshal default: return nil, fmt.Errorf("Unknown format: %v", format) } -} - -func (s *Server) decodeHashes(d Decoder, hashes []byte) error { - loadedHashes := make(savedHashes) - err := d(hashes, &loadedHashes) + hashes, err := s.hashes.EncodeHashes() if err != nil { - return err + return nil, err } - - for domain, ids := range loadedHashes { - for id := range ids { - if _, ok := s.ids[domain]; ok { - s.ids[domain][id] = struct{}{} - } else { - s.ids[domain] = make(map[string]struct{}) - } - } - } - for _, sourceHashes := range loadedHashes { - s.FullAhash = make(map[uint64][]string, len(sourceHashes)) - s.FullDhash = make(map[uint64][]string, len(sourceHashes)) - s.FullPhash = make(map[uint64][]string, len(sourceHashes)) - break - } - for domain, sourceHashes := range loadedHashes { - for id, h := range sourceHashes { - s.mapHashes(h[0], h[1], h[2], domain, id) - } - } - return nil + return encoder(hashes) } // DecodeHashes must have a lock to s.hashMutex func (s *Server) DecodeHashes(format Format, hashes []byte) error { + var decoder Decoder switch format { case Msgpack: - return s.decodeHashes(msgpack.Unmarshal, hashes) + decoder = msgpack.Unmarshal case JSON: - return s.decodeHashes(json.Unmarshal, hashes) + decoder = json.Unmarshal default: return fmt.Errorf("Unknown format: %v", format) } + loadedHashes := make(ch.SavedHashes) + err := decoder(hashes, &loadedHashes) + if err != nil { + return err + } + + return s.hashes.DecodeHashes(loadedHashes) } func (s *Server) HashLocalImages(opts Opts) { @@ -769,13 +564,13 @@ func startServer(opts Opts) { } mux := http.NewServeMux() + server := Server{ - // token: make(chan *oidc.Tokens), quit: make(chan struct{}), signalQueue: make(chan os.Signal, 1), - readerQueue: make(chan string, 1120130), // Number gotten from checking queue size + readerQueue: make(chan string, 100), hashingQueue: make(chan ch.Im), - mappingQueue: make(chan ch.Hash), + mappingQueue: make(chan ch.ImageHash), mux: mux, httpServer: &http.Server{ Addr: ":8080", @@ -786,12 +581,16 @@ func startServer(opts Opts) { }, } Notify(server.signalQueue) - imaging.SetMaxProcs(1) + var err error fmt.Println("init hashes") - server.initHashes() - // server.setupOauthHandlers() + server.hashes, err = ch.NewMapStorage() + if err != nil { + panic(err) + } + fmt.Println("init handlers") server.setupAppHandlers() + fmt.Println("init hashers") rwg := sync.WaitGroup{} for i := range 10 { @@ -829,7 +628,7 @@ func startServer(opts Opts) { if err != nil { panic(fmt.Sprintf("Failed to decode embedded hashes: %s", err)) } - fmt.Printf("Loaded embedded %s hashes ahashes: %d dhashes: %d phashes: %d\n", format, len(server.FullAhash), len(server.FullDhash), len(server.FullPhash)) + fmt.Printf("Loaded embedded %s hashes\n", format) } else { if f, err := os.Open(opts.hashesPath); err == nil { var buf io.Reader = f @@ -854,7 +653,7 @@ func startServer(opts Opts) { if err != nil { panic(fmt.Sprintf("Failed to decode hashes from disk: %s", err)) } - fmt.Printf("Loaded hashes from %q %s hashes ahashes: %d dhashes: %d phashes: %d\n", opts.hashesPath, format, len(server.FullAhash), len(server.FullDhash), len(server.FullPhash)) + fmt.Printf("Loaded hashes from %q %s\n", opts.hashesPath, format) } else { if errors.Is(err, os.ErrNotExist) { fmt.Println("No saved hashes to load") @@ -867,7 +666,7 @@ func startServer(opts Opts) { server.HashLocalImages(opts) fmt.Println("Listening on ", server.httpServer.Addr) - err := server.httpServer.ListenAndServe() + err = server.httpServer.ListenAndServe() if err != nil { fmt.Println(err) } diff --git a/cmd/hash/main.go b/cmd/hash/main.go index fde3027..80df25b 100644 --- a/cmd/hash/main.go +++ b/cmd/hash/main.go @@ -106,9 +106,9 @@ func main() { debugImage(debugim, 8, 8) } - hash := ch.HashImage(ch.Im{Im: im, Format: format, Domain: ch.Source(ch.ComicVine), ID: "nothing"}) + hash := ch.HashImage(ch.Im{Im: im, Format: format, ID: ch.ID{Domain: ch.Source(ch.ComicVine), ID: "nothing"}}) - fmt.Println("ahash: ", hash.Ahash.BinString()) - fmt.Println("dhash: ", hash.Dhash.BinString()) - fmt.Println("phash: ", hash.Phash.BinString()) + fmt.Println("ahash: ", goimagehash.NewImageHash(hash.Hashes[0].Hash, hash.Hashes[0].Kind).BinString()) + fmt.Println("dhash: ", goimagehash.NewImageHash(hash.Hashes[1].Hash, hash.Hashes[1].Kind).BinString()) + fmt.Println("phash: ", goimagehash.NewImageHash(hash.Hashes[2].Hash, hash.Hashes[2].Kind).BinString()) } diff --git a/go.mod b/go.mod index d035163..145d914 100644 --- a/go.mod +++ b/go.mod @@ -6,8 +6,8 @@ toolchain go1.22.2 require ( gitea.narnian.us/lordwelch/goimagehash v0.0.0-20240812025715-33ff96e45f00 - github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09 github.com/fmartingr/go-comicinfo/v2 v2.0.2 + github.com/kr/pretty v0.1.0 github.com/mholt/archiver/v4 v4.0.0-alpha.8 golang.org/x/image v0.19.0 golang.org/x/text v0.17.0 @@ -24,6 +24,7 @@ require ( github.com/bodgit/sevenzip v1.3.0 // indirect github.com/bodgit/windows v1.0.0 // indirect github.com/connesc/cipherio v0.2.1 // indirect + github.com/disintegration/imaging v1.6.3-0.20201218193011-d40f48ce0f09 // indirect github.com/dsnet/compress v0.0.1 // indirect github.com/golang/mock v1.6.0 // indirect github.com/golang/snappy v0.0.4 // indirect @@ -31,6 +32,7 @@ require ( github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/klauspost/compress v1.15.9 // indirect github.com/klauspost/pgzip v1.2.5 // indirect + github.com/kr/text v0.1.0 // indirect github.com/nwaples/rardecode/v2 v2.0.0-beta.2 // indirect github.com/pierrec/lz4/v4 v4.1.15 // indirect github.com/therootcompany/xz v1.0.1 // indirect diff --git a/go.sum b/go.sum index 46c0491..4535962 100644 --- a/go.sum +++ b/go.sum @@ -94,8 +94,10 @@ github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHU github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE= github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/lordwelch/text v0.0.0-20240505231825-4893f344170f h1:RMKTfrT4gjJfmB/aWuvCcFxUSvWAJfOAc5khGL6ASjk= github.com/lordwelch/text v0.0.0-20240505231825-4893f344170f/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= diff --git a/hashing.go b/hashing.go index de74e42..f8551c5 100644 --- a/hashing.go +++ b/hashing.go @@ -50,35 +50,49 @@ type Match struct { } type ID struct { - Domain, ID string -} - -type Result struct { - IDs []string // domain:id - Distance int - Hash ImageHash -} - -type Im struct { - Im image.Image - Format string - Domain Source - ID, Path string -} - -type Hash struct { - Ahash *goimagehash.ImageHash - Dhash *goimagehash.ImageHash - Phash *goimagehash.ImageHash Domain Source ID string } +type Result struct { + IDs IDList + Distance int + Hash Hash +} + +type Im struct { + Im image.Image + Format string + Path string + ID ID +} + type ImageHash struct { + Hashes []Hash + ID ID +} + +type Hash struct { Hash uint64 Kind goimagehash.Kind } +type SavedHashes map[Source]map[string][3]uint64 + +type NewIDs struct { + OldID ID + NewID ID +} + +type HashStorage interface { + GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) + MapHashes(ImageHash) + DecodeHashes(hashes SavedHashes) error + EncodeHashes() (SavedHashes, error) + AssociateIDs(newIDs []NewIDs) + GetIDs(id ID) IDList +} + func Atleast(maxDistance int, searchHash uint64, hashes []uint64) []Match { matchingHashes := make([]Match, 0, len(hashes)/2) // hope that we don't need all of them for _, storedHash := range hashes { @@ -98,47 +112,49 @@ func Insert[S ~[]E, E cmp.Ordered](slice S, item E) S { return slices.Insert(slice, index, item) } +func InsertIdx[S ~[]E, E cmp.Ordered](slice S, item E) (S, int) { + index, itemFound := slices.BinarySearch(slice, item) + if itemFound { + return slice, index + } + return slices.Insert(slice, index, item), index +} + func MemStats() uint64 { var m runtime.MemStats runtime.ReadMemStats(&m) return m.Alloc } -func HashImage(i Im) Hash { +func HashImage(i Im) ImageHash { if i.Format == "webp" { i.Im = goimagehash.FancyUpscale(i.Im.(*image.YCbCr)) } var ( - err error = nil - ahash *goimagehash.ImageHash - dhash *goimagehash.ImageHash - phash *goimagehash.ImageHash + err error ) - ahash, err = goimagehash.AverageHash(i.Im) + ahash, err := goimagehash.AverageHash(i.Im) if err != nil { msg := fmt.Sprintf("Failed to ahash Image: %s", err) log.Println(msg) - return Hash{} + return ImageHash{} } - dhash, err = goimagehash.DifferenceHash(i.Im) + dhash, err := goimagehash.DifferenceHash(i.Im) if err != nil { msg := fmt.Sprintf("Failed to dhash Image: %s", err) log.Println(msg) - return Hash{} + return ImageHash{} } - phash, err = goimagehash.PerceptionHash(i.Im) + phash, err := goimagehash.PerceptionHash(i.Im) if err != nil { msg := fmt.Sprintf("Failed to phash Image: %s", err) log.Println(msg) - return Hash{} + return ImageHash{} } - return Hash{ - Ahash: ahash, - Dhash: dhash, - Phash: phash, - Domain: i.Domain, + return ImageHash{ + Hashes: []Hash{{ahash.GetHash(), ahash.GetKind()}, {dhash.GetHash(), dhash.GetKind()}, {phash.GetHash(), phash.GetKind()}}, ID: i.ID, } } diff --git a/map.go b/map.go new file mode 100644 index 0000000..7d4443a --- /dev/null +++ b/map.go @@ -0,0 +1,274 @@ +package ch + +import ( + "cmp" + "math/bits" + "slices" + "sync" + + "gitea.narnian.us/lordwelch/goimagehash" +) + +type mapStorage struct { + hashMutex sync.RWMutex + partialHash [3][8]map[uint8][]int + // partialAhash [8]map[uint8][]int + // partialDhash [8]map[uint8][]int + // partialPhash [8]map[uint8][]int + + ids []ID + + idToHash map[int][3][]int + + hashes [3][]uint64 + // ahashes []uint64 + // dhashes []uint64 + // phashes []uint64 + + hashToID [3]map[int][]int + // ahashToID map[int][]int + // dhashToID map[int][]int + // phashToID map[int][]int +} + +func (m *mapStorage) addID(id ID) int { + index, itemFound := slices.BinarySearchFunc(m.ids, id, func(existing, new ID) int { + return cmp.Or( + cmp.Compare(existing.Domain, new.Domain), + cmp.Compare(existing.ID, new.ID), + ) + }) + if itemFound { + return index + } + m.ids = slices.Insert(m.ids, index, id) + return index +} + +func (m *mapStorage) getID(id ID) (int, bool) { + return slices.BinarySearchFunc(m.ids, id, func(existing, new ID) int { + return cmp.Or( + cmp.Compare(existing.Domain, new.Domain), + cmp.Compare(existing.ID, new.ID), + ) + }) +} + +func (m *mapStorage) Atleast(hashKind goimagehash.Kind, maxDistance int, searchHash uint64, hashes []int) []Result { + hashType := int(hashKind) - 1 + matchingHashes := make([]Result, 0, len(hashes)/2) // hope that we don't need all of them + for _, idx := range hashes { + storedHash := m.hashes[hashType][idx] + distance := bits.OnesCount64(searchHash ^ storedHash) + if distance <= maxDistance { + ids := make(IDList) + for _, idLocation := range m.hashToID[hashType][idx] { + ids[m.ids[idLocation].Domain] = Insert(ids[m.ids[idLocation].Domain], m.ids[idLocation].ID) + } + matchingHashes = append(matchingHashes, Result{ids, distance, Hash{storedHash, hashKind}}) + } + } + return matchingHashes +} +func (m *mapStorage) GetMatches(hashes []Hash, max int, exactOnly bool) ([]Result, error) { + var foundMatches []Result + m.hashMutex.RLock() + defer m.hashMutex.RUnlock() + + if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate + for _, hash := range hashes { + hashType := int(hash.Kind) - 1 + if hashLocation, found := slices.BinarySearch(m.hashes[hashType], hash.Hash); found { + idlist := make(IDList) + for _, idLocation := range m.hashToID[hashType][hashLocation] { + + for _, hashLocation := range m.idToHash[idLocation][0] { + for _, foundIDLocation := range m.hashToID[hashType][hashLocation] { + foundID := m.ids[foundIDLocation] + idlist[foundID.Domain] = Insert(idlist[foundID.Domain], foundID.ID) + } + } + } + if len(idlist) > 0 { + foundMatches = append(foundMatches, Result{ + Distance: 0, + Hash: hash, + }) + } + } + } + + // If we have exact matches don't bother with other matches + if len(foundMatches) > 0 && exactOnly { + return foundMatches, nil + } + } + + foundHashes := make(map[uint64]struct{}) + for _, hash := range hashes { + if hash.Hash == 0 { + continue + } + hashType := int(hash.Kind) - 1 + for i, partialHash := range SplitHash(hash.Hash) { + for _, match := range m.Atleast(hash.Kind, max, hash.Hash, m.partialHash[hashType][i][partialHash]) { + _, alreadyMatched := foundHashes[match.Hash.Hash] + if alreadyMatched { + continue + } + foundMatches = append(foundMatches, match) + } + } + } + + return foundMatches, nil +} + +func (m *mapStorage) MapHashes(hash ImageHash) { + + idIndex := m.addID(hash.ID) + idHashes := m.idToHash[idIndex] + for _, hash := range hash.Hashes { + var ( + hashIndex int + hashType = int(hash.Kind) - 1 + ) + m.hashes[hashType], hashIndex = InsertIdx(m.hashes[hashType], hash.Hash) + for i, partialHash := range SplitHash(hash.Hash) { + m.partialHash[hashType][i][partialHash] = append(m.partialHash[hashType][i][partialHash], hashIndex) + } + idHashes[hashType] = Insert(idHashes[hashType], hashIndex) + m.hashToID[hashType][hashIndex] = Insert(m.hashToID[hashType][hashIndex], idIndex) + } + m.idToHash[idIndex] = idHashes +} + +func (m *mapStorage) DecodeHashes(hashes SavedHashes) error { + + for _, sourceHashes := range hashes { + m.hashes[0] = make([]uint64, 0, len(sourceHashes)) + m.hashes[1] = make([]uint64, 0, len(sourceHashes)) + m.hashes[2] = make([]uint64, 0, len(sourceHashes)) + break + } + for domain, sourceHashes := range hashes { + for id, h := range sourceHashes { + m.ids = append(m.ids, ID{Domain: Source(domain), ID: id}) + + for _, hash := range []Hash{Hash{h[0], goimagehash.AHash}, Hash{h[1], goimagehash.DHash}, Hash{h[2], goimagehash.PHash}} { + var ( + hashType = int(hash.Kind) - 1 + ) + m.hashes[hashType] = append(m.hashes[hashType], hash.Hash) + } + } + } + slices.SortFunc(m.ids, func(existing, new ID) int { + return cmp.Or( + cmp.Compare(existing.Domain, new.Domain), + cmp.Compare(existing.ID, new.ID), + ) + }) + slices.Sort(m.hashes[0]) + slices.Sort(m.hashes[1]) + slices.Sort(m.hashes[2]) + for domain, sourceHashes := range hashes { + for id, h := range sourceHashes { + m.MapHashes(ImageHash{ + Hashes: []Hash{{h[0], goimagehash.AHash}, {h[1], goimagehash.DHash}, {h[2], goimagehash.PHash}}, + ID: ID{Domain: Source(domain), ID: id}, + }) + } + } + return nil +} + +func (m *mapStorage) EncodeHashes() (SavedHashes, error) { + hashes := make(SavedHashes) + for idLocation, hashLocation := range m.idToHash { + id := m.ids[idLocation] + _, ok := hashes[id.Domain] + if !ok { + hashes[id.Domain] = make(map[string][3]uint64) + } + // TODO: Add all hashes. Currently saved hashes does not allow multiple IDs for a single hash + hashes[id.Domain][id.ID] = [3]uint64{ + m.hashes[0][hashLocation[0][0]], + m.hashes[1][hashLocation[1][0]], + m.hashes[2][hashLocation[2][0]], + } + } + return hashes, nil +} + +func (m *mapStorage) AssociateIDs(newids []NewIDs) { + for _, ids := range newids { + oldIDLocation, found := m.getID(ids.OldID) + if !found { + msg := "No IDs belonging to " + ids.OldID.Domain + "exist on this server" + panic(msg) + } + + newIDLocation := m.addID(ids.NewID) + + for _, hashType := range []int{int(goimagehash.AHash), int(goimagehash.DHash), int(goimagehash.PHash)} { + for _, hashLocation := range m.idToHash[oldIDLocation][hashType] { + m.hashToID[hashType][hashLocation] = Insert(m.hashToID[hashType][hashLocation], newIDLocation) + idHashes := m.idToHash[newIDLocation] + idHashes[hashType] = Insert(idHashes[hashType], hashLocation) + m.idToHash[newIDLocation] = idHashes + } + } + } +} + +func (m *mapStorage) GetIDs(id ID) IDList { + idIndex, found := m.getID(id) + if !found { + msg := "No IDs belonging to " + id.Domain + "exist on this server" + panic(msg) + } + ids := make(IDList) + + for _, hashLocation := range m.idToHash[idIndex][0] { + for _, foundIDLocation := range m.hashToID[0][hashLocation] { + foundID := m.ids[foundIDLocation] + ids[foundID.Domain] = Insert(ids[foundID.Domain], foundID.ID) + } + } + for _, hashLocation := range m.idToHash[idIndex][1] { + for _, foundIDLocation := range m.hashToID[1][hashLocation] { + foundID := m.ids[foundIDLocation] + ids[foundID.Domain] = Insert(ids[foundID.Domain], foundID.ID) + } + } + for _, hashLocation := range m.idToHash[idIndex][2] { + for _, foundIDLocation := range m.hashToID[2][hashLocation] { + foundID := m.ids[foundIDLocation] + ids[foundID.Domain] = Insert(ids[foundID.Domain], foundID.ID) + } + } + return ids +} + +func NewMapStorage() (HashStorage, error) { + storage := &mapStorage{ + hashMutex: sync.RWMutex{}, + idToHash: make(map[int][3][]int), + hashToID: [3]map[int][]int{ + make(map[int][]int), + make(map[int][]int), + make(map[int][]int), + }, + } + for i := range storage.partialHash[0] { + storage.partialHash[0][i] = make(map[uint8][]int) + } + for i := range storage.partialHash[1] { + storage.partialHash[1][i] = make(map[uint8][]int) + } + for i := range storage.partialHash[2] { + storage.partialHash[2][i] = make(map[uint8][]int) + } + return storage, nil +}