2024-09-01 18:05:58 -07:00
package ch
import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"math/bits"
"strings"
2024-09-07 14:51:18 -07:00
"time"
2024-09-01 18:05:58 -07:00
"gitea.narnian.us/lordwelch/goimagehash"
_ "modernc.org/sqlite"
)
type sqliteStorage struct {
db * sql . DB
}
type sqliteHash struct {
hashid int
Result
}
func ( s * sqliteStorage ) findExactHashes ( statement * sql . Stmt , items ... interface { } ) ( [ ] sqliteHash , error ) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
hashes := [ ] sqliteHash { }
rows , err := statement . Query ( items ... )
if err != nil {
return hashes , err
}
for rows . Next ( ) {
var (
r = sqliteHash { Result : Result { IDs : make ( IDList ) } }
h int64
)
err = rows . Scan ( & r . hashid , & h , & r . Hash . Kind )
if err != nil {
rows . Close ( )
return hashes , err
}
r . Hash . Hash = uint64 ( h )
hashes = append ( hashes , r )
}
rows . Close ( )
statement , err = s . db . PrepareContext ( context . Background ( ) , ` SELECT IDS.domain, IDs.id FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid=?) ORDER BY IDs.domain, IDs.ID; ` )
if err != nil {
return hashes , err
}
for _ , hash := range hashes {
rows , err := statement . Query ( hash . hashid )
if err != nil {
return hashes , err
}
for rows . Next ( ) {
var source Source
var id string
err := rows . Scan ( & source , & id )
if err != nil {
return hashes , err
}
hash . IDs [ source ] = append ( hash . IDs [ source ] , id )
}
rows . Close ( )
}
return hashes , nil
}
func ( s * sqliteStorage ) findPartialHashes ( max int , search_hash int64 , kind goimagehash . Kind ) ( [ ] sqliteHash , error ) { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
hashes := [ ] sqliteHash { }
2024-09-07 14:51:18 -07:00
statement , err := s . db . PrepareContext ( context . Background ( ) , ` SELECT rowid,hash,kind FROM Hashes WHERE (kind=?) AND (((hash >> (0 * 8) & 0xFF)=(?2 >> (0 * 8) & 0xFF)) OR ((hash >> (1 * 8) & 0xFF)=(?2 >> (1 * 8) & 0xFF)) OR ((hash >> (2 * 8) & 0xFF)=(?2 >> (2 * 8) & 0xFF)) OR ((hash >> (3 * 8) & 0xFF)=(?2 >> (3 * 8) & 0xFF)) OR ((hash >> (4 * 8) & 0xFF)=(?2 >> (4 * 8) & 0xFF)) OR ((hash >> (5 * 8) & 0xFF)=(?2 >> (5 * 8) & 0xFF)) OR ((hash >> (6 * 8) & 0xFF)=(?2 >> (6 * 8) & 0xFF)) OR ((hash >> (7 * 8) & 0xFF)=(?2 >> (7 * 8) & 0xFF))); ` )
2024-09-01 18:05:58 -07:00
if err != nil {
return hashes , err
}
2024-09-07 14:51:18 -07:00
rows , err := statement . Query ( kind , int64 ( search_hash ) )
2024-09-01 18:05:58 -07:00
if err != nil {
return hashes , err
}
for rows . Next ( ) {
var (
r = sqliteHash { Result : Result { IDs : make ( IDList ) } }
h int64
)
err = rows . Scan ( & r . hashid , & h , & r . Hash . Kind )
if err != nil {
rows . Close ( )
return hashes , err
}
r . Hash . Hash = uint64 ( h )
r . Distance = bits . OnesCount64 ( uint64 ( search_hash ) ^ r . Hash . Hash )
if r . Distance <= max {
hashes = append ( hashes , r )
}
}
rows . Close ( )
2024-09-07 14:51:18 -07:00
logTime ( "Filter partial " + kind . String ( ) )
2024-09-01 18:05:58 -07:00
statement , err = s . db . PrepareContext ( context . Background ( ) , ` SELECT DISTINCT IDS.domain, IDs.id, id_hash.hashid FROM IDs JOIN id_hash ON IDs.rowid = id_hash.idid WHERE (id_hash.hashid in ( ` + strings . TrimRight ( strings . Repeat ( "?," , len ( hashes ) ) , "," ) + ` )) ORDER BY IDs.domain, IDs.ID; ` )
if err != nil {
return hashes , err
}
var ids [ ] any
for _ , hash := range hashes {
ids = append ( ids , hash . hashid )
}
rows , err = statement . Query ( ids ... )
if err != nil {
return hashes , err
}
for rows . Next ( ) {
var source Source
var id string
var hashid int
err := rows . Scan ( & source , & id , & hashid )
if err != nil {
return hashes , err
}
for _ , hash := range hashes {
if hash . hashid == hashid {
hash . IDs [ source ] = append ( hash . IDs [ source ] , id )
}
}
}
rows . Close ( )
return hashes , nil
}
func ( s * sqliteStorage ) dropIndexes ( ) error {
_ , err := s . db . Exec ( `
2024-09-02 15:35:36 -07:00
DROP INDEX IF EXISTS hash_index ;
DROP INDEX IF EXISTS hash_1_index ;
DROP INDEX IF EXISTS hash_2_index ;
DROP INDEX IF EXISTS hash_3_index ;
DROP INDEX IF EXISTS hash_4_index ;
DROP INDEX IF EXISTS hash_5_index ;
DROP INDEX IF EXISTS hash_6_index ;
DROP INDEX IF EXISTS hash_7_index ;
DROP INDEX IF EXISTS hash_8_index ;
2024-09-01 18:05:58 -07:00
2024-09-02 15:35:36 -07:00
DROP INDEX IF EXISTS id_domain ;
` )
2024-09-01 18:05:58 -07:00
if err != nil {
return err
}
return nil
}
func ( s * sqliteStorage ) createIndexes ( ) error {
_ , err := s . db . Exec ( `
CREATE INDEX IF NOT EXISTS hash_index ON Hashes ( kind , hash ) ;
CREATE INDEX IF NOT EXISTS hash_1_index ON Hashes ( ( hash >> ( 0 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_2_index ON Hashes ( ( hash >> ( 1 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_3_index ON Hashes ( ( hash >> ( 2 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_4_index ON Hashes ( ( hash >> ( 3 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_5_index ON Hashes ( ( hash >> ( 4 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_6_index ON Hashes ( ( hash >> ( 5 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_7_index ON Hashes ( ( hash >> ( 6 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS hash_8_index ON Hashes ( ( hash >> ( 7 * 8 ) & 0xFF ) ) ;
CREATE INDEX IF NOT EXISTS id_domain ON IDs ( domain , id ) ;
PRAGMA shrink_memory ;
2024-09-07 14:51:18 -07:00
ANALYZE ;
2024-09-01 18:05:58 -07:00
` )
if err != nil {
return err
}
return nil
}
2024-09-07 14:51:18 -07:00
var (
total time . Duration
t = time . Now ( )
)
func resetTime ( ) {
total = 0
t = time . Now ( )
}
func logTime ( log string ) {
n := time . Now ( )
s := n . Sub ( t )
t = n
total += s
fmt . Printf ( "total: %v, %s: %v\n" , total , log , s )
}
2024-09-01 18:05:58 -07:00
func ( s * sqliteStorage ) GetMatches ( hashes [ ] Hash , max int , exactOnly bool ) ( [ ] Result , error ) {
2024-09-07 14:51:18 -07:00
var (
foundMatches [ ] Result
)
resetTime ( )
2024-09-01 18:05:58 -07:00
if exactOnly { // exact matches are also found by partial matches. Don't bother with exact matches so we don't have to de-duplicate
statement , err := s . db . Prepare ( ` SELECT rowid,hash,kind FROM Hashes WHERE ` + strings . TrimSuffix ( strings . Repeat ( "(hash=? AND kind=?) OR" , len ( hashes ) ) , "OR" ) + ` ORDER BY kind,hash; ` )
if err != nil {
2024-09-07 14:51:18 -07:00
logTime ( "Fail exact" )
2024-09-01 18:05:58 -07:00
return foundMatches , err
}
2024-09-07 14:51:18 -07:00
2024-09-01 18:05:58 -07:00
args := make ( [ ] interface { } , 0 , len ( hashes ) * 2 )
for _ , hash := range hashes {
if hash . Hash != 0 {
args = append ( args , int64 ( hash . Hash ) , hash . Kind )
}
}
hashes , err := s . findExactHashes ( statement , args ... )
if err != nil {
return foundMatches , err
}
for _ , hash := range hashes {
foundMatches = append ( foundMatches , hash . Result )
}
// If we have exact matches don't bother with other matches
if len ( foundMatches ) > 0 && exactOnly {
return foundMatches , nil
}
2024-09-07 14:51:18 -07:00
logTime ( "Search Exact" )
2024-09-01 18:05:58 -07:00
}
foundHashes := make ( map [ uint64 ] struct { } )
for _ , hash := range hashes {
hashes , err := s . findPartialHashes ( max , int64 ( hash . Hash ) , hash . Kind )
if err != nil {
return foundMatches , err
}
2024-09-07 14:51:18 -07:00
logTime ( "Search partial " + hash . Kind . String ( ) )
2024-09-01 18:05:58 -07:00
for _ , hash := range hashes {
if _ , alreadyMatched := foundHashes [ hash . Hash . Hash ] ; ! alreadyMatched {
foundHashes [ hash . Hash . Hash ] = struct { } { }
foundMatches = append ( foundMatches , hash . Result )
} else {
log . Println ( "Hash already found" , hash )
}
}
}
return foundMatches , nil
}
func ( s * sqliteStorage ) MapHashes ( hash ImageHash ) {
2024-09-07 14:51:18 -07:00
tx , err := s . db . BeginTx ( context . Background ( ) , nil )
if err != nil {
panic ( err )
}
insertHashes , err := tx . Prepare ( `
INSERT INTO Hashes ( hash , kind ) VALUES ( ? , ? ) ON CONFLICT DO UPDATE SET hash = ? 1 RETURNING hashid
2024-09-01 18:05:58 -07:00
` )
if err != nil {
panic ( err )
}
2024-09-07 14:51:18 -07:00
rows , err := tx . Query ( `
INSERT INTO IDs ( domain , id ) VALUES ( ? , ? ) ON CONFLICT DO UPDATE SET domain = ? 1 RETURNING idid
2024-09-02 15:35:36 -07:00
` , hash . ID . Domain , hash . ID . ID )
2024-09-01 18:05:58 -07:00
if err != nil {
panic ( err )
}
2024-09-02 15:35:36 -07:00
if ! rows . Next ( ) {
panic ( "Unable to insert IDs" )
}
var id_id int64
err = rows . Scan ( & id_id )
2024-09-01 18:05:58 -07:00
if err != nil {
panic ( err )
}
2024-09-02 15:35:36 -07:00
rows . Close ( )
2024-09-01 18:05:58 -07:00
hash_ids := [ ] int64 { }
for _ , hash := range hash . Hashes {
2024-09-02 15:35:36 -07:00
rows , err := insertHashes . Query ( int64 ( hash . Hash ) , hash . Kind )
2024-09-01 18:05:58 -07:00
if err != nil {
panic ( err )
}
2024-09-02 15:35:36 -07:00
if ! rows . Next ( ) {
panic ( "Unable to insert IDs" )
}
var id int64
err = rows . Scan ( & id )
rows . Close ( )
2024-09-01 18:05:58 -07:00
if err != nil {
panic ( err )
}
hash_ids = append ( hash_ids , id )
}
2024-09-07 14:51:18 -07:00
var ids [ ] any
2024-09-01 18:05:58 -07:00
for _ , hash_id := range hash_ids {
2024-09-07 14:51:18 -07:00
ids = append ( ids , hash_id , id_id )
}
_ , err = tx . Exec ( ` INSERT INTO id_hash (hashid,idid) VALUES ` + strings . TrimSuffix ( strings . Repeat ( "(?, ?)," , len ( hash_ids ) ) , "," ) + ` ON CONFLICT DO NOTHING; ` , ids ... )
if err != nil {
panic ( fmt . Errorf ( "Failed inserting: %v,%v: %w" , hash . ID . Domain , hash . ID . ID , err ) )
}
err = tx . Commit ( )
if err != nil {
panic ( err )
2024-09-01 18:05:58 -07:00
}
2024-09-07 14:51:18 -07:00
insertHashes . Close ( )
2024-09-01 18:05:58 -07:00
}
func ( s * sqliteStorage ) DecodeHashes ( hashes SavedHashes ) error {
err := s . dropIndexes ( )
if err != nil {
return err
}
2024-09-07 14:51:18 -07:00
for hashType , sourceHashes := range hashes . Hashes {
hashKind := goimagehash . Kind ( hashType + 1 )
for hash , idsLocations := range sourceHashes {
for _ , id := range hashes . IDs [ idsLocations ] {
s . MapHashes ( ImageHash {
Hashes : [ ] Hash { { hash , hashKind } } ,
ID : id ,
} )
}
2024-09-01 18:05:58 -07:00
}
}
err = s . createIndexes ( )
if err != nil {
return err
}
return nil
}
func ( s * sqliteStorage ) EncodeHashes ( ) ( SavedHashes , error ) {
2024-09-07 14:51:18 -07:00
hashes := SavedHashes { }
2024-09-01 18:05:58 -07:00
conn , err := s . db . Conn ( context . Background ( ) )
if err != nil {
return hashes , err
}
defer conn . Close ( )
2024-09-07 14:51:18 -07:00
rows , err := conn . QueryContext ( context . Background ( ) , "SELECT IDs.domain,IDs.id,Hashes.hash,Hashes.kind FROM Hashes JOIN id_hash ON id_hash.hashid = hashes.rowid JOIN IDs ON IDs.rowid = id_hash.idid ORDER BY IDs.ID,Hashes.kind,Hashes.hash;" )
2024-09-01 18:05:58 -07:00
if err != nil {
2024-09-07 14:51:18 -07:00
rows . Close ( )
2024-09-01 18:05:58 -07:00
return hashes , err
}
2024-09-07 14:51:18 -07:00
var (
id ID
hash Hash
)
err = rows . Scan ( & id . Domain , & id . ID , & hash . Hash , & hash . Kind )
if err != nil {
return hashes , err
2024-09-01 18:05:58 -07:00
}
2024-09-07 14:51:18 -07:00
hashes . InsertHash ( hash , id )
2024-09-01 18:05:58 -07:00
return hashes , nil
}
2024-10-14 02:02:26 -07:00
func ( s * sqliteStorage ) AssociateIDs ( newIDs [ ] NewIDs ) error {
2024-09-01 18:05:58 -07:00
for _ , ids := range newIDs {
var oldIDID , newIDID int
_ , err := s . db . Exec ( ` INSERT INTO IDs domain,id VALUES (?,?) ` , ids . NewID . Domain , ids . NewID . ID )
if err != nil {
2024-10-14 02:02:26 -07:00
return err
2024-09-01 18:05:58 -07:00
}
rows , err := s . db . Query ( ` SELECT idid FROM IDs WHERE domain=? AND id=? ` , ids . NewID . Domain , ids . NewID . ID )
if err != nil && ! errors . Is ( err , sql . ErrNoRows ) {
2024-10-14 02:02:26 -07:00
return err
2024-09-01 18:05:58 -07:00
}
if rows . Next ( ) {
rows . Scan ( & newIDID )
} else {
2024-10-14 02:02:26 -07:00
return errors . New ( "Unable to insert New ID into database" )
2024-09-01 18:05:58 -07:00
}
rows . Close ( )
rows , err = s . db . Query ( ` SELECT idid FROM IDs WHERE domain=? AND id=? ` , ids . OldID . Domain , ids . OldID . ID )
if err != nil && ! errors . Is ( err , sql . ErrNoRows ) {
2024-10-14 02:02:26 -07:00
return err
2024-09-01 18:05:58 -07:00
}
if rows . Next ( ) {
rows . Scan ( & oldIDID )
} else {
continue
}
_ , err = s . db . Exec ( ` INSERT INTO id_hash (hashid, id_id) SELECT hashid,? FROM id_hash where id_id=? ` , newIDID , oldIDID )
if err != nil {
2024-10-14 02:02:26 -07:00
return fmt . Errorf ( "Unable to associate IDs: %w" , err )
2024-09-01 18:05:58 -07:00
}
}
2024-10-14 02:02:26 -07:00
return nil
2024-09-01 18:05:58 -07:00
}
func ( s * sqliteStorage ) GetIDs ( id ID ) IDList {
var idid int
rows , err := s . db . Query ( ` SELECT idid FROM IDs WHERE domain=? AND id=? ` , id . Domain , id . ID )
if err != nil && ! errors . Is ( err , sql . ErrNoRows ) {
2024-10-14 02:02:26 -07:00
return nil
2024-09-01 18:05:58 -07:00
}
if rows . Next ( ) {
rows . Scan ( & idid )
} else {
return nil
}
rows , err = s . db . Query ( ` SELECT id_hash FROM id_hash WHERE id_id=? ` , idid )
if err != nil && ! errors . Is ( err , sql . ErrNoRows ) {
panic ( err )
}
var hashIDs [ ] interface { }
for rows . Next ( ) {
var hashID int
rows . Scan ( & hashID )
hashIDs = append ( hashIDs , hashID )
}
rows . Close ( )
IDs := make ( IDList )
rows , err = s . db . Query ( ` SELECT IDs.domain,IDs.id FROM id_hash JOIN IDs ON id_hash.idid==IDs.idid WHERE hash_id in ( ` + strings . TrimRight ( strings . Repeat ( "?," , len ( hashIDs ) ) , "," ) + ` ) ` , hashIDs ... )
if err != nil && ! errors . Is ( err , sql . ErrNoRows ) {
panic ( err )
}
for rows . Next ( ) {
var id ID
rows . Scan ( & id . Domain , id . ID )
IDs [ id . Domain ] = append ( IDs [ id . Domain ] , id . ID )
}
return IDs
}
func NewSqliteStorage ( db , path string ) ( HashStorage , error ) {
sqlite := & sqliteStorage { }
sqlDB , err := sql . Open ( db , fmt . Sprintf ( "file://%s?_pragma=cache_size(-200000)&_pragma=busy_timeout(500)&_pragma=hard_heap_limit(1073741824)&_pragma=journal_mode(wal)&_pragma=soft_heap_limit(314572800)" , path ) )
if err != nil {
panic ( err )
}
sqlite . db = sqlDB
_ , err = sqlite . db . Exec ( `
PRAGMA foreign_keys = ON ;
CREATE TABLE IF NOT EXISTS Hashes (
hashid INTEGER PRIMARY KEY ,
hash INT NOT NULL ,
kind int NOT NULL ,
UNIQUE ( kind , hash )
) ;
CREATE TABLE IF NOT EXISTS IDs (
id TEXT NOT NULL ,
domain TEXT NOT NULL ,
idid INTEGER PRIMARY KEY ,
UNIQUE ( domain , id )
) ;
CREATE INDEX IF NOT EXISTS id_domain ON IDs ( domain , id ) ;
CREATE TABLE IF NOT EXISTS id_hash (
hashid INTEGER ,
idid INTEGER ,
FOREIGN KEY ( hashid ) REFERENCES Hashes ( hashid ) ,
FOREIGN KEY ( idid ) REFERENCES IDs ( idid )
UNIQUE ( hashid , idid )
) ;
` )
if err != nil {
panic ( err )
}
2024-09-07 14:51:18 -07:00
sqlite . createIndexes ( )
2024-09-01 18:05:58 -07:00
sqlite . db . SetMaxOpenConns ( 1 )
return sqlite , nil
}