// comic-hasher/cv/cv.go
package cv
import (
"bufio"
2024-12-25 14:53:56 -08:00
"bytes"
2024-10-14 02:03:37 -07:00
"cmp"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
"log"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"slices"
2024-12-15 14:15:33 -08:00
ch "gitea.narnian.us/lordwelch/comic-hasher"
2024-10-14 02:03:37 -07:00
)
// Download describes one cover image that is handed to the consumer of
// CVDownloader.FinishedDownloadQueue.
type Download struct {
	URL     string        // URL the image was requested from
	Dest    string        // Path of the image on disk (or where it would have been written)
	IssueID string        // ComicVine issue ID formatted as a decimal string
	Image   *bytes.Buffer // Image bytes when the image was downloaded into memory; nil otherwise
}
// Issue is the subset of a ComicVine issue record that is decoded from the
// API's JSON: the issue ID, the URLs of its cover image in every size the API
// reports, and the ID of the volume the issue belongs to.
type Issue struct {
	ID    int `json:"id"`
	Image struct {
		IconURL        string `json:"icon_url,omitempty"`
		MediumURL      string `json:"medium_url,omitempty"`
		ScreenURL      string `json:"screen_url,omitempty"`
		ScreenLargeURL string `json:"screen_large_url,omitempty"`
		SmallURL       string `json:"small_url,omitempty"`
		SuperURL       string `json:"super_url,omitempty"`
		ThumbURL       string `json:"thumb_url"`
		TinyURL        string `json:"tiny_url,omitempty"`
		OriginalURL    string `json:"original_url"`
		ImageTags      string `json:"image_tags"`
	} `json:"image"`
	Volume struct {
		ID int `json:"id"`
	} `json:"volume"`
}
// CVResult is one page of the ComicVine issue listing as stored in the cached
// cv-<offset>.json files and as returned by the API. The commented-out fields
// are present in the API response but are not decoded.
type CVResult struct {
	// Error string `json:"error"`
	// Limit int `json:"limit"`
	Offset               int     `json:"offset"`                  // Offset of the first issue of this page; compared against the offset in the file name
	NumberOfPageResults  int     `json:"number_of_page_results"`  // Number of issues in this page
	NumberOfTotalResults int     `json:"number_of_total_results"` // Total number of issues reported by the API
	StatusCode           int     `json:"status_code"`
	Results              []Issue `json:"results"`
	// Version string `json:"version"`
}
type CVDownloader struct {
APIKey string
JSONPath string
ImagePath string
ImageTypes []string
SendExistingImages bool
2024-12-25 14:53:56 -08:00
KeepDownloadedImages bool
2024-10-14 02:03:37 -07:00
Context context.Context
FinishedDownloadQueue chan Download
2025-01-11 15:26:35 -08:00
fileList []string
2024-10-14 02:03:37 -07:00
totalResults int
imageWG sync.WaitGroup
downloadQueue chan *CVResult
imageDownloads chan download
notFound chan download
2024-12-15 14:15:33 -08:00
chdb ch.CHDB
bufPool *sync.Pool
2024-10-14 02:03:37 -07:00
}
2024-10-19 09:50:23 -07:00
var (
	// ErrQuit is returned when work stops because the downloader's context is done.
	ErrQuit = errors.New("Quit")
	// ErrInvalidPage is returned by loadIssues when the offset stored inside a
	// cached page does not match the offset encoded in its file name.
	ErrInvalidPage = errors.New("Invalid ComicVine Page")
)
2024-10-14 02:03:37 -07:00
func (c *CVDownloader) readJson() ([]*CVResult, error) {
var issues []*CVResult
2025-01-11 15:26:35 -08:00
for _, filename := range c.fileList {
2024-10-14 02:03:37 -07:00
if c.hasQuit() {
return nil, ErrQuit
}
2025-01-11 15:26:35 -08:00
result, err := c.loadIssues(filename)
2024-10-14 02:03:37 -07:00
if err != nil {
2024-10-19 09:50:23 -07:00
if err == ErrInvalidPage {
continue
}
2024-10-14 02:03:37 -07:00
return issues, err
}
2024-10-19 09:50:23 -07:00
c.totalResults = max(result.NumberOfTotalResults, c.totalResults)
2024-10-14 02:03:37 -07:00
issues = append(issues, result)
}
return issues, nil
}
2025-01-11 15:26:35 -08:00
func (c *CVDownloader) loadIssues(filename string) (*CVResult, error) {
tmp := &CVResult{Results: make([]Issue, 0, 100)}
2025-01-11 15:26:35 -08:00
file, err := os.Open(filepath.Join(c.JSONPath, filename))
2024-10-14 02:03:37 -07:00
if err != nil {
return nil, err
}
bytes, err := io.ReadAll(file)
if err != nil {
return nil, err
}
err = json.Unmarshal(bytes, tmp)
if err != nil {
return nil, err
}
2025-01-11 15:26:35 -08:00
if getOffset(filename) != tmp.Offset {
2024-10-19 09:50:23 -07:00
return nil, ErrInvalidPage
}
2024-10-14 02:03:37 -07:00
return tmp, nil
}
// Get issues an HTTP GET for url using http.DefaultClient, bounded by a ten
// second timeout derived from ctx.
//
// The returned function cancels the timeout context. It is never nil and the
// caller must invoke it once it is done with the response, also when an error
// is returned.
func Get(ctx context.Context, url string) (*http.Response, error, func()) {
	timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	request, err := http.NewRequestWithContext(timeoutCtx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err, cancel
	}
	response, err := http.DefaultClient.Do(request)
	return response, err, cancel
}
2025-01-11 15:26:35 -08:00
// getOffset extracts the page offset from a cache file name of the form
// "cv-<offset>.json": the first three bytes and the last five bytes are cut off
// and the remainder is parsed as a decimal integer.
//
// It returns 0 when the name is too short to hold both parts (previously this
// panicked with a slice bounds error) or when the middle part is not a number.
func getOffset(name string) int {
	const (
		prefixLen = len("cv-")
		suffixLen = len(".json")
	)
	if len(name) < prefixLen+suffixLen {
		return 0
	}
	// The parse error is deliberately dropped: a non-numeric name maps to 0.
	i, _ := strconv.Atoi(name[prefixLen : len(name)-suffixLen])
	return i
}
// updateIssues c.downloadQueue must not be closed before this function has returned
func (c *CVDownloader) updateIssues() {
base_url, err := url.Parse("https://comicvine.gamespot.com/api/issues/?sort=date_added,id:asc&format=json&field_list=id,image,volume")
if err != nil {
log.Fatal(err)
}
query := base_url.Query()
query.Add("api_key", c.APIKey)
base_url.RawQuery = query.Encode()
c.totalResults = max(c.totalResults, 1)
failCount := 0
prev := -1
offset := 0
2024-10-18 14:30:30 -07:00
retry := func(url string, err error) bool {
2024-10-14 02:03:37 -07:00
if errors.Is(err, context.Canceled) {
log.Println("Server closed")
return false
}
2024-10-18 14:30:30 -07:00
log.Printf("Failed to download %#v at offset %v: %v Attempt #%d", url, offset, err, failCount+1)
2024-10-14 02:03:37 -07:00
if prev == offset {
sleepTime := time.Second * 36
if failCount > 2 {
sleepTime = time.Minute * 10
}
2024-10-19 09:39:08 -07:00
log.Println("This page failed to download, lets wait for", sleepTime, "and hope it works")
2024-10-14 02:03:37 -07:00
select {
case <-c.Context.Done(): // allows us to return immediately even during a timeout
return false
case <-time.After(sleepTime):
}
}
prev = offset
failCount += 1
offset -= 100
return failCount < 15
}
for offset = 0; offset < c.totalResults; offset += 100 {
2025-01-11 15:26:35 -08:00
index := offset / 100
2024-10-14 02:03:37 -07:00
if c.hasQuit() {
return
}
2025-01-11 15:26:35 -08:00
if index < len(c.fileList) {
if getOffset(c.fileList[index]) == offset { // If it's in order and it's not missing it should be here
if issue, err := c.loadIssues(c.fileList[index]); err == nil && issue != nil {
2024-10-14 02:03:37 -07:00
c.totalResults = max(c.totalResults, issue.NumberOfTotalResults)
prev = -1
failCount = 0
// When canceled one of these will randomly be chosen, c.downloadQueue won't be closed until after this function returns
select {
case <-c.Context.Done():
case c.downloadQueue <- issue:
}
continue
} else {
2025-01-11 15:26:35 -08:00
log.Println("Failed to read page at offset", offset, issue, err)
os.Remove(filepath.Join(c.JSONPath, c.fileList[index]))
c.fileList = slices.Delete(c.fileList, index, index+1)
2024-10-14 02:03:37 -07:00
}
2025-01-11 15:26:35 -08:00
} else {
log.Printf("Expected Offset %d got Offset %d", offset, getOffset(c.fileList[index]))
2024-10-14 02:03:37 -07:00
}
}
2025-01-11 15:26:35 -08:00
index, found := slices.BinarySearchFunc(c.fileList, offset, func(a string, b int) int {
return cmp.Compare(getOffset(a), b)
2024-10-14 02:03:37 -07:00
})
if found {
2025-01-11 15:26:35 -08:00
if issue, err := c.loadIssues(c.fileList[index]); err == nil && issue != nil {
2024-10-14 02:03:37 -07:00
prev = -1
failCount = 0
// When canceled one of these will randomly be chosen, c.downloadQueue won't be closed until after this function returns
select {
case <-c.Context.Done():
case c.downloadQueue <- issue:
}
continue
2024-10-19 09:50:23 -07:00
} else {
2025-01-11 15:26:35 -08:00
log.Println("Failed to read page at offset", offset, issue, err)
os.Remove(filepath.Join(c.JSONPath, c.fileList[index]))
2025-01-11 12:19:14 -08:00
c.fileList = slices.Delete(c.fileList, index, (index)+1)
2024-10-14 02:03:37 -07:00
}
}
log.Println("Starting download at offset", offset)
issue := &CVResult{}
2024-10-18 14:30:30 -07:00
URI := (*base_url)
2024-10-14 02:03:37 -07:00
query = base_url.Query()
query.Add("offset", strconv.Itoa(offset))
URI.RawQuery = query.Encode()
select {
case <-c.Context.Done(): // allows us to return immediately even during a timeout
return
case <-time.After(10 * time.Second):
}
resp, err, cancelDownloadCTX := Get(c.Context, URI.String())
if err != nil {
cancelDownloadCTX()
2024-10-18 14:30:30 -07:00
if retry(URI.String(), err) {
2024-10-14 02:03:37 -07:00
continue
}
return
}
2024-10-19 09:39:08 -07:00
if resp.StatusCode != 200 {
cancelDownloadCTX()
if retry(URI.String(), nil) {
2024-10-31 19:12:53 -07:00
_ = resp.Body.Close()
2024-10-19 09:39:08 -07:00
continue
}
log.Println("Failed to download this page, we'll wait for an hour to see if it clears up")
select {
case <-c.Context.Done(): // allows us to return immediately even during a timeout
2024-10-31 19:12:53 -07:00
_ = resp.Body.Close()
2024-10-19 09:39:08 -07:00
return
case <-time.After(1 * time.Hour):
}
}
2024-10-14 02:03:37 -07:00
file, err := os.Create(filepath.Join(c.JSONPath, "cv-"+strconv.Itoa(offset)+".json"))
if err != nil {
log.Fatal(err)
}
body := io.TeeReader(resp.Body, file)
err = json.NewDecoder(bufio.NewReader(body)).Decode(issue)
2024-10-31 19:12:53 -07:00
_ = resp.Body.Close()
_ = file.Close()
if err != nil || issue.Offset != offset {
os.Remove(filepath.Join(c.JSONPath, "cv-"+strconv.Itoa(offset)+".json"))
2024-10-14 02:03:37 -07:00
cancelDownloadCTX()
2024-10-18 14:30:30 -07:00
if retry(URI.String(), err) {
2024-10-14 02:03:37 -07:00
continue
}
return
}
cancelDownloadCTX()
if issue.NumberOfTotalResults > c.totalResults {
c.totalResults = issue.NumberOfTotalResults
}
prev = -1
failCount = 0
// When canceled one of these will randomly be chosen, c.downloadQueue won't be closed until after this function returns
select {
case <-c.Context.Done():
return
case c.downloadQueue <- issue:
}
log.Printf("Downloaded %s/cv-%v.json", c.JSONPath, offset)
}
}
// download is one unit of work for the image download goroutines.
type download struct {
	url      string // URL of the image
	dest     string // Path the image is (or would be) stored at
	offset   int    // Offset of the page the issue was listed on
	volumeID int    // ComicVine volume ID of the issue
	issueID  int    // ComicVine issue ID
	finished bool   // When true nothing is downloaded, the entry is only forwarded to FinishedDownloadQueue
}
func (c *CVDownloader) start_downloader() {
for i := range 5 {
go func() {
log.Println("starting downloader", i)
for dl := range c.imageDownloads {
if c.hasQuit() {
c.imageWG.Done()
continue // We must continue so that c.imageWG will complete otherwise it will hang forever
}
if dl.finished {
select {
case <-c.Context.Done():
c.imageWG.Done()
continue
case c.FinishedDownloadQueue <- Download{
URL: dl.url,
Dest: dl.dest,
IssueID: strconv.Itoa(dl.issueID),
}:
c.imageWG.Done()
}
continue
}
2024-12-15 14:15:33 -08:00
dir := filepath.Dir(dl.dest)
2024-10-14 02:03:37 -07:00
resp, err, cancelDownload := Get(c.Context, dl.url)
if err != nil {
cancelDownload()
2024-12-15 14:15:33 -08:00
log.Println("Failed to download", dl.volumeID, "/", dl.issueID, dl.url, err)
2024-10-14 02:03:37 -07:00
c.imageWG.Done()
continue
}
cleanup := func() {
resp.Body.Close()
cancelDownload()
c.imageWG.Done()
}
if resp.StatusCode == 404 {
c.notFound <- dl
cleanup()
continue
}
if resp.StatusCode != 200 {
log.Println("Failed to download", dl.url, resp.StatusCode)
cleanup()
continue
}
2024-12-25 14:53:56 -08:00
if c.KeepDownloadedImages {
_ = os.MkdirAll(dir, 0o755)
2024-12-25 14:53:56 -08:00
image, err := os.Create(dl.dest)
if err != nil {
log.Println("Unable to create image file", dl.dest, err)
os.Remove(dl.dest)
image.Close()
cleanup()
continue
}
log.Println("downloading", dl.dest)
_, err = io.Copy(image, resp.Body)
image.Close()
if err != nil {
log.Println("Failed when downloading image", err)
os.Remove(dl.dest)
cleanup()
continue
}
c.FinishedDownloadQueue <- Download{
URL: dl.url,
Dest: dl.dest,
IssueID: strconv.Itoa(dl.issueID),
}
} else {
image := c.bufPool.Get().(*bytes.Buffer)
image.Reset()
2024-12-25 14:53:56 -08:00
log.Println("downloading", dl.dest)
_, err = io.Copy(image, resp.Body)
if err != nil {
log.Println("Failed when downloading image", err)
cleanup()
os.Remove(dl.dest)
// Something failed let this buffer GC instead of saving it
2024-12-25 14:53:56 -08:00
continue
}
2024-10-14 02:03:37 -07:00
2024-12-25 14:53:56 -08:00
c.FinishedDownloadQueue <- Download{
URL: dl.url,
Dest: dl.dest,
IssueID: strconv.Itoa(dl.issueID),
Image: image,
2024-12-25 14:53:56 -08:00
}
2024-10-14 02:03:37 -07:00
}
cleanup()
}
}()
}
}
func (c *CVDownloader) handleNotFound() {
for failedDownload := range c.notFound {
2024-12-15 14:15:33 -08:00
c.chdb.AddURL(failedDownload.url)
2024-10-14 02:03:37 -07:00
log.Printf("Not found: volumeID: %d issueID: %d Offset: %d URL: %s\n", failedDownload.volumeID, failedDownload.issueID, failedDownload.offset, failedDownload.url)
}
}
func (c *CVDownloader) downloadImages() {
defer func() {
log.Println("Waiting for final images to complete download")
c.imageWG.Wait()
}()
go c.start_downloader()
go c.handleNotFound()
added := 0
for list := range c.downloadQueue {
log.Printf("Checking downloads at offset %v\r", list.Offset)
for _, issue := range list.Results {
type i struct {
url string
name string
}
imageURLs := []i{{issue.Image.IconURL, "icon_url"}, {issue.Image.MediumURL, "medium_url"}, {issue.Image.ScreenURL, "screen_url"}, {issue.Image.ScreenLargeURL, "screen_large_url"}, {issue.Image.SmallURL, "small_url"}, {issue.Image.SuperURL, "super_url"}, {issue.Image.ThumbURL, "thumb_url"}, {issue.Image.TinyURL, "tiny_url"}, {issue.Image.OriginalURL, "original_url"}}
for _, image := range imageURLs {
if c.hasQuit() {
return
}
if len(c.ImageTypes) > 0 && !slices.Contains(c.ImageTypes, image.name) {
continue
}
2024-12-15 14:15:33 -08:00
if c.chdb.CheckURL(image.url) {
2024-10-14 02:03:37 -07:00
log.Printf("Skipping known bad url %s", image.url)
continue
}
uri, err := url.ParseRequestURI(image.url)
if err != nil {
c.notFound <- download{
url: image.url,
offset: list.Offset,
volumeID: issue.Volume.ID,
issueID: issue.ID,
finished: true,
}
}
ext := strings.TrimSuffix(strings.ToLower(path.Ext(uri.Path)), "~original")
if ext == "" || (len(ext) > 4 && !slices.Contains([]string{".avif", ".webp", ".tiff", ".heif"}, ext)) {
ext = ".jpg"
}
2024-12-15 14:15:33 -08:00
dir := filepath.Join(c.ImagePath, strconv.Itoa(issue.Volume.ID), strconv.Itoa(issue.ID))
path := filepath.Join(dir, image.name+ext)
2024-10-14 02:03:37 -07:00
2024-12-15 14:15:33 -08:00
if c.chdb.PathDownloaded(path) {
if _, err = os.Stat(path); c.SendExistingImages && err == nil {
2024-10-14 02:03:37 -07:00
// We don't add to the count of added as these should be processed immediately
2024-12-15 14:15:33 -08:00
log.Printf("Sending Existing image %v/%v %v", issue.Volume.ID, issue.ID, path)
2024-10-14 02:03:37 -07:00
c.imageWG.Add(1)
c.imageDownloads <- download{
url: image.url,
dest: path,
offset: list.Offset,
volumeID: issue.Volume.ID,
issueID: issue.ID,
finished: true,
}
}
continue // If it exists assume it is fine, adding some basic verification might be a good idea later
}
added++
2024-12-15 14:15:33 -08:00
2024-10-14 02:03:37 -07:00
c.imageWG.Add(1)
c.imageDownloads <- download{
url: image.url,
dest: path,
offset: list.Offset,
volumeID: issue.Volume.ID,
issueID: issue.ID,
}
}
if added > 200 {
// On a clean single image type run each page would have 100 downloads of a single cover type but stuff happens so we only wait once we have sent 200 to the queue
log.Println("waiting for", added, "downloads at offset", list.Offset)
beforeWait := time.Now()
c.imageWG.Wait()
waited := time.Since(beforeWait)
added = 0
2024-12-25 14:53:56 -08:00
// If we had to wait for the arbitrarily picked time of 7.4 seconds it means we had a backed up queue (slow hashing can also cause it to wait longer), lets wait to give the CV servers a break
2024-10-14 02:03:37 -07:00
if waited > time.Duration(7.4*float64(time.Second)) {
t := 10 * time.Second
log.Println("Waiting for", t, "at offset", list.Offset, "had to wait for", waited)
select {
case <-c.Context.Done(): // allows us to return immediately even during a timeout
return
case <-time.After(t):
}
} else {
// Things are too fast we can't depend CV being slow to manage our download speed
// We sleep for 3 seconds so we don't overload CV
time.Sleep(3 * time.Second)
2024-10-14 02:03:37 -07:00
}
}
}
}
}
func (c *CVDownloader) cleanBadURLs() error {
var indexesToRemove []int
list:
for i, jsonFile := range c.fileList {
list, err := c.loadIssues(jsonFile)
if err != nil {
indexesToRemove = append(indexesToRemove, i)
2025-01-11 15:26:35 -08:00
os.Remove(filepath.Join(c.JSONPath, jsonFile))
2024-10-14 02:03:37 -07:00
continue
}
for _, issue := range list.Results {
for _, url := range []string{issue.Image.IconURL, issue.Image.MediumURL, issue.Image.ScreenURL, issue.Image.ScreenLargeURL, issue.Image.SmallURL, issue.Image.SuperURL, issue.Image.ThumbURL, issue.Image.TinyURL, issue.Image.OriginalURL} {
if c.hasQuit() {
return ErrQuit
}
2024-12-15 14:15:33 -08:00
if c.chdb.CheckURL(url) {
2024-10-14 02:03:37 -07:00
indexesToRemove = append(indexesToRemove, i)
2025-01-11 15:26:35 -08:00
if err := os.Remove(filepath.Join(c.JSONPath, jsonFile)); err != nil {
2024-10-14 02:03:37 -07:00
return err
}
// We've removed the entire page, lets see if the new url works
continue list
}
}
}
}
slices.Reverse(indexesToRemove)
for _, i := range indexesToRemove {
c.fileList = slices.Delete(c.fileList, i, min(i+1, len(c.fileList)-1))
}
return nil
}
// hasQuit reports, without blocking, whether c.Context is done.
func (c *CVDownloader) hasQuit() bool {
	quit := false
	select {
	case <-c.Context.Done():
		quit = true
	default:
		// Not done yet; fall through with quit still false.
	}
	return quit
}
2024-12-25 14:53:56 -08:00
// cleanDirs walks c.ImagePath and calls ch.RmdirP on the absolute path of
// every directory it visits. A directory for which RmdirP reports success is
// not descended into. Errors reported by the walk itself abort it; the overall
// result is discarded.
func (c *CVDownloader) cleanDirs() {
	visit := func(entryPath string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if !entry.IsDir() {
			return nil
		}
		absPath, _ := filepath.Abs(entryPath)
		// The error is only for the first path value. EG ch.RmdirP("/test/t") will only return the error for os.Remove("/test/t") not os.Remove("test")
		if ch.RmdirP(absPath) == nil {
			return filepath.SkipDir
		}
		return nil
	}
	_ = filepath.WalkDir(c.ImagePath, visit)
}
func NewCVDownloader(ctx context.Context, bufPool *sync.Pool, chdb ch.CHDB, workPath, APIKey string, imageTypes []string, keepDownloadedImages, sendExistingImages bool, finishedDownloadQueue chan Download) *CVDownloader {
2024-10-14 02:03:37 -07:00
return &CVDownloader{
Context: ctx,
JSONPath: filepath.Join(workPath, "_json"),
ImagePath: filepath.Join(workPath, "_image"),
APIKey: APIKey,
downloadQueue: make(chan *CVResult, 100), // This is just json it shouldn't take up much more than 122 MB
imageDownloads: make(chan download, 1), // These are just URLs should only take a few MB
notFound: make(chan download, 1), // Same here
bufPool: bufPool, // Only used if keepDownloadedImages is false to save space on byte buffers. The buffers get sent back via finishedDownloadQueue
2024-10-14 02:03:37 -07:00
FinishedDownloadQueue: finishedDownloadQueue,
SendExistingImages: sendExistingImages,
2024-12-25 14:53:56 -08:00
KeepDownloadedImages: keepDownloadedImages,
2024-10-14 02:03:37 -07:00
ImageTypes: imageTypes,
2024-12-15 14:15:33 -08:00
chdb: chdb,
2024-10-14 02:03:37 -07:00
}
}
func DownloadCovers(c *CVDownloader) {
var (
err error
)
os.MkdirAll(c.JSONPath, 0o777)
2024-12-25 14:53:56 -08:00
f, _ := os.Create(filepath.Join(c.ImagePath, ".keep"))
f.Close()
2025-01-11 15:26:35 -08:00
if !c.KeepDownloadedImages {
log.Println("Cleaning directories")
c.cleanDirs()
}
log.Println("Reading json")
var d *os.File
d, err = os.Open(c.JSONPath)
c.fileList, err = d.Readdirnames(-1)
2024-10-14 02:03:37 -07:00
if err != nil {
panic(fmt.Errorf("Unable to open path for json files: %w", err))
}
2025-01-11 15:26:35 -08:00
slices.SortFunc(c.fileList, func(x, y string) int {
return cmp.Compare(getOffset(x), getOffset(y))
2024-10-14 02:03:37 -07:00
})
if len(c.fileList) > 0 {
2025-01-11 15:26:35 -08:00
c.totalResults = getOffset(c.fileList[len(c.fileList)-1])
2024-10-14 02:03:37 -07:00
}
c.totalResults += 100
log.Println("Number of pages", len(c.fileList), "Expected Pages:", c.totalResults/100)
log.Println("Updating issues now")
dwg := sync.WaitGroup{}
dwg.Add(1)
go func() {
c.downloadImages()
dwg.Done()
}()
c.updateIssues()
issueCount := len(c.fileList) * 100
log.Println("Number of issues", issueCount, " expected:", c.totalResults)
2024-12-25 14:53:56 -08:00
close(c.downloadQueue) // sends only happen in c.updateIssues which has already been called
// We don't drain here as we want to process them
2024-10-14 02:03:37 -07:00
log.Println("Waiting for downloaders")
dwg.Wait()
close(c.imageDownloads)
for range c.imageDownloads {
}
close(c.notFound)
for range c.notFound {
}
2024-12-25 14:53:56 -08:00
// We drain this at the end because we need to wait for the images to download
for range c.downloadQueue {
}
2024-10-14 02:03:37 -07:00
log.Println("Completed downloading images")
}