Add experimental word splitting to the filename parser

Adds a global setting as well as a setting that is only in effect
during auto-tagging.
This commit is contained in:
lordwelch 2021-08-29 17:09:57 -07:00 committed by Timmy Welch
parent 444e67100c
commit cdeca34791
8 changed files with 62 additions and 14 deletions

View File

@ -16,6 +16,7 @@
import zipfile
import os
import os.path
import struct
import sys
import tempfile
@ -23,6 +24,7 @@ import subprocess
import platform
import time
import io
import wordninja
import natsort
from PyPDF2 import PdfFileReader
@ -1080,11 +1082,14 @@ class ComicArchive:
data = self.getPage(idx)
p['ImageSize'] = str(len(data))
def metadataFromFilename(self, parse_scan_info=True):
def metadataFromFilename(self, parse_scan_info=True, split_words=False):
metadata = GenericMetadata()
fnp = FileNameParser()
fnp.parseFilename(self.path)
filename = self.path
if split_words:
filename = " ".join(wordninja.split(os.path.splitext(os.path.basename(self.path))[0]))
fnp.parseFilename(filename)
if fnp.issue != "":
metadata.issue = fnp.issue

View File

@ -180,10 +180,10 @@ class FileNameParser:
series = re.sub("\(.*?\)", "", series)
# search for volume number
match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series)
match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)?\s*$', series)
if match:
series = match.group(1)
volume = match.group(3)
volume = match.group(3) or ""
# if a volume wasn't found, see if the last word is a year in parentheses
# since that's a common way to designate the volume
@ -283,6 +283,9 @@ class FileNameParser:
self.volume,
issue_end)
if self.issue == "" and self.volume != "":
self.issue = self.volume
if self.issue != "":
# strip off leading zeros
self.issue = self.issue.lstrip("0")

View File

@ -45,6 +45,7 @@ class AutoTagStartWindow(QtWidgets.QDialog):
QtCore.Qt.Unchecked)
self.cbxRemoveAfterSuccess.setCheckState(QtCore.Qt.Unchecked)
self.cbxSpecifySearchString.setCheckState(QtCore.Qt.Unchecked)
self.cbxSplitWords.setCheckState(QtCore.Qt.Unchecked)
self.leNameLengthMatchTolerance.setText(
str(self.settings.id_length_delta_thresh))
self.leSearchString.setEnabled(False)
@ -62,6 +63,8 @@ class AutoTagStartWindow(QtWidgets.QDialog):
self.cbxRemoveAfterSuccess.setCheckState(QtCore.Qt.Checked)
if self.settings.wait_and_retry_on_rate_limit:
self.cbxWaitForRateLimit.setCheckState(QtCore.Qt.Checked)
if self.settings.split_words:
self.cbxSplitWords.setCheckState(QtCore.Qt.Checked)
nlmtTip = (
""" <html>The <b>Name Length Match Tolerance</b> is for eliminating automatic
@ -96,6 +99,7 @@ class AutoTagStartWindow(QtWidgets.QDialog):
self.waitAndRetryOnRateLimit = False
self.searchString = None
self.nameLengthMatchTolerance = self.settings.id_length_delta_thresh
self.splitWords = self.cbxSplitWords.isChecked()
def searchStringToggle(self):
enable = self.cbxSpecifySearchString.isChecked()
@ -112,6 +116,7 @@ class AutoTagStartWindow(QtWidgets.QDialog):
self.nameLengthMatchTolerance = int(
self.leNameLengthMatchTolerance.text())
self.waitAndRetryOnRateLimit = self.cbxWaitForRateLimit.isChecked()
self.splitWords = self.cbxSplitWords.isChecked()
# persist some settings
self.settings.save_on_low_confidence = self.autoSaveOnLow

View File

@ -88,6 +88,7 @@ class ComicTaggerSettings:
# filename parsing settings
self.parse_scan_info = True
self.split_words = False
# Comic Vine settings
self.use_series_start_as_volume = False
@ -229,6 +230,9 @@ class ComicTaggerSettings:
if self.config.has_option('filenameparser', 'parse_scan_info'):
self.parse_scan_info = self.config.getboolean(
'filenameparser', 'parse_scan_info')
if self.config.has_option('filenameparser', 'split_words'):
self.split_words = self.config.getboolean(
'filenameparser', 'split_words')
if self.config.has_option('dialogflags', 'ask_about_cbi_in_rar'):
self.ask_about_cbi_in_rar = self.config.getboolean(
@ -396,6 +400,8 @@ class ComicTaggerSettings:
self.config.set(
'filenameparser', 'parse_scan_info', self.parse_scan_info)
# Persist the word-splitting flag under its own key. The value written must be
# self.split_words (not self.parse_scan_info): the load path reads the
# 'split_words' option back into self.split_words, so writing any other
# attribute here would silently replace the user's choice on the next start.
self.config.set(
    'filenameparser', 'split_words', self.split_words)
if not self.config.has_section('comicvine'):
self.config.add_section('comicvine')

View File

@ -128,6 +128,8 @@ class SettingsWindow(QtWidgets.QDialog):
if self.settings.parse_scan_info:
self.cbxParseScanInfo.setCheckState(QtCore.Qt.Checked)
if self.settings.split_words:
self.cbxSplitWords.setCheckState(QtCore.Qt.Checked)
if self.settings.use_series_start_as_volume:
self.cbxUseSeriesStartAsVolume.setCheckState(QtCore.Qt.Checked)
@ -189,6 +191,7 @@ class SettingsWindow(QtWidgets.QDialog):
self.tePublisherBlacklist.toPlainText())
self.settings.parse_scan_info = self.cbxParseScanInfo.isChecked()
self.settings.split_words = self.cbxSplitWords.isChecked()
self.settings.use_series_start_as_volume = self.cbxUseSeriesStartAsVolume.isChecked()
self.settings.clear_form_before_populating_from_cv = self.cbxClearFormBeforePopulating.isChecked()

View File

@ -595,7 +595,7 @@ class TaggerWindow(QtWidgets.QMainWindow):
def actualLoadCurrentArchive(self):
if self.metadata.isEmpty:
self.metadata = self.comic_archive.metadataFromFilename(
self.settings.parse_scan_info)
self.settings.parse_scan_info, self.settings.split_words)
if len(self.metadata.pages) == 0:
self.metadata.setDefaultPageList(
self.comic_archive.getNumberOfPages())
@ -965,7 +965,7 @@ class TaggerWindow(QtWidgets.QMainWindow):
# copy the form onto metadata object
self.formToMetadata()
new_metadata = self.comic_archive.metadataFromFilename(
self.settings.parse_scan_info)
self.settings.parse_scan_info, self.settings.split_words)
if new_metadata is not None:
self.metadata.overlay(new_metadata)
self.metadataToForm()
@ -1727,14 +1727,14 @@ class TaggerWindow(QtWidgets.QMainWindow):
QtCore.QCoreApplication.processEvents()
QtCore.QCoreApplication.processEvents()
def identifyAndTagSingleArchive(self, ca, match_results, dlg):
def identifyAndTagSingleArchive(self, ca, match_results, dlg, split_words):
success = False
ii = IssueIdentifier(ca, self.settings)
# read in metadata, and parse file name if not there
md = ca.readMetadata(self.save_data_style)
if md.isEmpty:
md = ca.metadataFromFilename(self.settings.parse_scan_info)
md = ca.metadataFromFilename(self.settings.parse_scan_info, split_words)
if dlg.ignoreLeadingDigitsInFilename and md.series is not None:
# remove all leading numbers
md.series = re.sub("([\d.]*)(.*)", "\\2", md.series)
@ -1892,7 +1892,7 @@ class TaggerWindow(QtWidgets.QMainWindow):
if ca.isWritable():
success, match_results = self.identifyAndTagSingleArchive(
ca, match_results, atstartdlg)
ca, match_results, atstartdlg, atstartdlg.splitWords)
if success and atstartdlg.removeAfterSuccess:
archives_to_remove.append(ca)

View File

@ -10,7 +10,7 @@
<x>0</x>
<y>0</y>
<width>519</width>
<height>378</height>
<height>420</height>
</rect>
</property>
<property name="sizePolicy">
@ -44,7 +44,7 @@
</item>
<item row="1" column="0">
<layout class="QGridLayout" name="gridLayout">
<item row="6" column="0">
<item row="7" column="0">
<widget class="QCheckBox" name="cbxSpecifySearchString">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
@ -96,6 +96,19 @@
</property>
</widget>
</item>
<item row="6" column="0">
<widget class="QCheckBox" name="cbxSplitWords">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Split words in filenames (e.g. 'judgedredd' to 'judge dredd') (Experimental)</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="cbxWaitForRateLimit">
<property name="text">
@ -129,7 +142,7 @@
</property>
</widget>
</item>
<item row="10" column="0">
<item row="11" column="0">
<widget class="QLineEdit" name="leNameLengthMatchTolerance">
<property name="sizePolicy">
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
@ -145,7 +158,7 @@
</property>
</widget>
</item>
<item row="7" column="0">
<item row="8" column="0">
<widget class="QLineEdit" name="leSearchString">
<property name="sizePolicy">
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
@ -155,7 +168,7 @@
</property>
</widget>
</item>
<item row="8" column="0">
<item row="9" column="0">
<widget class="QLabel" name="label_3">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Fixed">

View File

@ -225,6 +225,19 @@
<string>Parse Scan Info From Filename (Experimental)</string>
</property>
</widget>
<widget class="QCheckBox" name="cbxSplitWords">
<property name="geometry">
<rect>
<x>30</x>
<y>60</y>
<width>541</width>
<height>25</height>
</rect>
</property>
<property name="text">
<string>Split words apart in filenames (e.g. 'judgedredd' to 'judge dredd') (Experimental)</string>
</property>
</widget>
</widget>
<widget class="QWidget" name="tab_3">
<attribute name="title">