From cdeca3479167b1d9ad52dab63cd3a78c7a2746f3 Mon Sep 17 00:00:00 2001 From: lordwelch Date: Sun, 29 Aug 2021 17:09:57 -0700 Subject: [PATCH] Add experimental word splitting to the filename parser Adds a global setting as well as a setting that is only in effect during auto-tagging --- comicapi/comicarchive.py | 9 +++++++-- comicapi/filenameparser.py | 7 +++++-- comictaggerlib/autotagstartwindow.py | 5 +++++ comictaggerlib/settings.py | 6 ++++++ comictaggerlib/settingswindow.py | 3 +++ comictaggerlib/taggerwindow.py | 10 +++++----- comictaggerlib/ui/autotagstartwindow.ui | 23 ++++++++++++++++++----- comictaggerlib/ui/settingswindow.ui | 13 +++++++++++++ 8 files changed, 62 insertions(+), 14 deletions(-) diff --git a/comicapi/comicarchive.py b/comicapi/comicarchive.py index 02ee666..a9b2e42 100644 --- a/comicapi/comicarchive.py +++ b/comicapi/comicarchive.py @@ -16,6 +16,7 @@ import zipfile import os +import os.path import struct import sys import tempfile @@ -23,6 +24,7 @@ import subprocess import platform import time import io +import wordninja import natsort from PyPDF2 import PdfFileReader @@ -1080,11 +1082,14 @@ class ComicArchive: data = self.getPage(idx) p['ImageSize'] = str(len(data)) - def metadataFromFilename(self, parse_scan_info=True): + def metadataFromFilename(self, parse_scan_info=True, split_words=False): metadata = GenericMetadata() fnp = FileNameParser() - fnp.parseFilename(self.path) + filename = self.path + if split_words: + filename = " ".join(wordninja.split(os.path.splitext(os.path.basename(self.path))[0])) + fnp.parseFilename(filename) if fnp.issue != "": metadata.issue = fnp.issue diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index 476d14b..dd29e24 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -180,10 +180,10 @@ class FileNameParser: series = re.sub("\(.*?\)", "", series) # search for volume number - match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series) + match = 
re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)?\s*$', series) if match: series = match.group(1) - volume = match.group(3) + volume = match.group(3) or "" # if a volume wasn't found, see if the last word is a year in parentheses # since that's a common way to designate the volume @@ -283,6 +283,9 @@ class FileNameParser: self.volume, issue_end) + if self.issue == "" and self.volume != "": + self.issue = self.volume + if self.issue != "": # strip off leading zeros self.issue = self.issue.lstrip("0") diff --git a/comictaggerlib/autotagstartwindow.py b/comictaggerlib/autotagstartwindow.py index 653c369..feceb2b 100644 --- a/comictaggerlib/autotagstartwindow.py +++ b/comictaggerlib/autotagstartwindow.py @@ -45,6 +45,7 @@ class AutoTagStartWindow(QtWidgets.QDialog): QtCore.Qt.Unchecked) self.cbxRemoveAfterSuccess.setCheckState(QtCore.Qt.Unchecked) self.cbxSpecifySearchString.setCheckState(QtCore.Qt.Unchecked) + self.cbxSplitWords.setCheckState(QtCore.Qt.Unchecked) self.leNameLengthMatchTolerance.setText( str(self.settings.id_length_delta_thresh)) self.leSearchString.setEnabled(False) @@ -62,6 +63,8 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.cbxRemoveAfterSuccess.setCheckState(QtCore.Qt.Checked) if self.settings.wait_and_retry_on_rate_limit: self.cbxWaitForRateLimit.setCheckState(QtCore.Qt.Checked) + if self.settings.split_words: + self.cbxSplitWords.setCheckState(QtCore.Qt.Checked) nlmtTip = ( """ The Name Length Match Tolerance is for eliminating automatic @@ -96,6 +99,7 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.waitAndRetryOnRateLimit = False self.searchString = None self.nameLengthMatchTolerance = self.settings.id_length_delta_thresh + self.splitWords = self.cbxSplitWords.isChecked() def searchStringToggle(self): enable = self.cbxSpecifySearchString.isChecked() @@ -112,6 +116,7 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.nameLengthMatchTolerance = int( self.leNameLengthMatchTolerance.text()) self.waitAndRetryOnRateLimit = 
self.cbxWaitForRateLimit.isChecked() + self.splitWords = self.cbxSplitWords.isChecked() # persist some settings self.settings.save_on_low_confidence = self.autoSaveOnLow diff --git a/comictaggerlib/settings.py b/comictaggerlib/settings.py index 8808788..0c3afd6 100644 --- a/comictaggerlib/settings.py +++ b/comictaggerlib/settings.py @@ -88,6 +88,7 @@ class ComicTaggerSettings: # filename parsing settings self.parse_scan_info = True + self.split_words = False # Comic Vine settings self.use_series_start_as_volume = False @@ -229,6 +230,9 @@ class ComicTaggerSettings: if self.config.has_option('filenameparser', 'parse_scan_info'): self.parse_scan_info = self.config.getboolean( 'filenameparser', 'parse_scan_info') + if self.config.has_option('filenameparser', 'split_words'): + self.split_words = self.config.getboolean( + 'filenameparser', 'split_words') if self.config.has_option('dialogflags', 'ask_about_cbi_in_rar'): self.ask_about_cbi_in_rar = self.config.getboolean( @@ -396,6 +400,8 @@ class ComicTaggerSettings: self.config.set( 'filenameparser', 'parse_scan_info', self.parse_scan_info) + self.config.set( + 'filenameparser', 'split_words', self.split_words) if not self.config.has_section('comicvine'): self.config.add_section('comicvine') diff --git a/comictaggerlib/settingswindow.py b/comictaggerlib/settingswindow.py index 0149a12..3ba519e 100644 --- a/comictaggerlib/settingswindow.py +++ b/comictaggerlib/settingswindow.py @@ -128,6 +128,8 @@ class SettingsWindow(QtWidgets.QDialog): if self.settings.parse_scan_info: self.cbxParseScanInfo.setCheckState(QtCore.Qt.Checked) + if self.settings.split_words: + self.cbxSplitWords.setCheckState(QtCore.Qt.Checked) if self.settings.use_series_start_as_volume: self.cbxUseSeriesStartAsVolume.setCheckState(QtCore.Qt.Checked) @@ -189,6 +191,7 @@ class SettingsWindow(QtWidgets.QDialog): self.tePublisherBlacklist.toPlainText()) self.settings.parse_scan_info = self.cbxParseScanInfo.isChecked() + self.settings.split_words = 
self.cbxSplitWords.isChecked() self.settings.use_series_start_as_volume = self.cbxUseSeriesStartAsVolume.isChecked() self.settings.clear_form_before_populating_from_cv = self.cbxClearFormBeforePopulating.isChecked() diff --git a/comictaggerlib/taggerwindow.py b/comictaggerlib/taggerwindow.py index be4ab8f..2b7bc92 100644 --- a/comictaggerlib/taggerwindow.py +++ b/comictaggerlib/taggerwindow.py @@ -595,7 +595,7 @@ class TaggerWindow(QtWidgets.QMainWindow): def actualLoadCurrentArchive(self): if self.metadata.isEmpty: self.metadata = self.comic_archive.metadataFromFilename( - self.settings.parse_scan_info) + self.settings.parse_scan_info, self.settings.split_words) if len(self.metadata.pages) == 0: self.metadata.setDefaultPageList( self.comic_archive.getNumberOfPages()) @@ -965,7 +965,7 @@ class TaggerWindow(QtWidgets.QMainWindow): # copy the form onto metadata object self.formToMetadata() new_metadata = self.comic_archive.metadataFromFilename( - self.settings.parse_scan_info) + self.settings.parse_scan_info, self.settings.split_words) if new_metadata is not None: self.metadata.overlay(new_metadata) self.metadataToForm() @@ -1727,14 +1727,14 @@ class TaggerWindow(QtWidgets.QMainWindow): QtCore.QCoreApplication.processEvents() QtCore.QCoreApplication.processEvents() - def identifyAndTagSingleArchive(self, ca, match_results, dlg): + def identifyAndTagSingleArchive(self, ca, match_results, dlg, split_words): success = False ii = IssueIdentifier(ca, self.settings) # read in metadata, and parse file name if not there md = ca.readMetadata(self.save_data_style) if md.isEmpty: - md = ca.metadataFromFilename(self.settings.parse_scan_info) + md = ca.metadataFromFilename(self.settings.parse_scan_info, split_words) if dlg.ignoreLeadingDigitsInFilename and md.series is not None: # remove all leading numbers md.series = re.sub("([\d.]*)(.*)", "\\2", md.series) @@ -1892,7 +1892,7 @@ class TaggerWindow(QtWidgets.QMainWindow): if ca.isWritable(): success, match_results = 
self.identifyAndTagSingleArchive( - ca, match_results, atstartdlg) + ca, match_results, atstartdlg, atstartdlg.splitWords) if success and atstartdlg.removeAfterSuccess: archives_to_remove.append(ca) diff --git a/comictaggerlib/ui/autotagstartwindow.ui b/comictaggerlib/ui/autotagstartwindow.ui index dc863d0..e5fdd36 100644 --- a/comictaggerlib/ui/autotagstartwindow.ui +++ b/comictaggerlib/ui/autotagstartwindow.ui @@ -10,7 +10,7 @@ 0 0 519 - 378 + 420 @@ -44,7 +44,7 @@ - + @@ -96,6 +96,19 @@ + + + + + 0 + 0 + + + + Split words in filenames (e.g. 'judgedredd' to 'judge dredd') (Experimental) + + + @@ -129,7 +142,7 @@ - + @@ -145,7 +158,7 @@ - + @@ -155,7 +168,7 @@ - + diff --git a/comictaggerlib/ui/settingswindow.ui b/comictaggerlib/ui/settingswindow.ui index c64f5f5..455b791 100644 --- a/comictaggerlib/ui/settingswindow.ui +++ b/comictaggerlib/ui/settingswindow.ui @@ -225,6 +225,19 @@ Parse Scan Info From Filename (Experimental) + + + + 30 + 60 + 541 + 25 + + + + Split words apart in filenames (e.g. 'judgedredd' to 'judge dredd') (Experimental) + +