various fixes after merging comicstream-integr
This commit is contained in:
parent
259769ae59
commit
65bc97f3f6
@ -12,7 +12,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
|
|||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
You may obtain a copy of the License at
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
@ -31,247 +31,256 @@ from urllib import unquote
|
|||||||
|
|
||||||
class FileNameParser:
|
class FileNameParser:
|
||||||
|
|
||||||
def repl(self, m):
|
def repl(self, m):
|
||||||
return ' ' * len(m.group())
|
return ' ' * len(m.group())
|
||||||
|
|
||||||
def fixSpaces( self, string, remove_dashes=True ):
|
def fixSpaces( self, string, remove_dashes=True ):
|
||||||
if remove_dashes:
|
if remove_dashes:
|
||||||
placeholders = ['[-_]',' +']
|
placeholders = ['[-_]',' +']
|
||||||
else:
|
else:
|
||||||
placeholders = ['[_]',' +']
|
placeholders = ['[_]',' +']
|
||||||
for ph in placeholders:
|
for ph in placeholders:
|
||||||
string = re.sub(ph, self.repl, string )
|
string = re.sub(ph, self.repl, string )
|
||||||
return string #.strip()
|
return string #.strip()
|
||||||
|
|
||||||
|
|
||||||
def getIssueCount( self,filename, issue_end ):
|
def getIssueCount( self,filename, issue_end ):
|
||||||
|
|
||||||
count = ""
|
count = ""
|
||||||
filename = filename[issue_end:]
|
filename = filename[issue_end:]
|
||||||
|
|
||||||
# replace any name separators with spaces
|
|
||||||
tmpstr = self.fixSpaces(filename)
|
|
||||||
found = False
|
|
||||||
|
|
||||||
match = re.search('(?<=\sof\s)\d+(?=\s)', tmpstr, re.IGNORECASE)
|
|
||||||
if match:
|
|
||||||
count = match.group()
|
|
||||||
found = True
|
|
||||||
|
|
||||||
if not found:
|
# replace any name separators with spaces
|
||||||
match = re.search('(?<=\(of\s)\d+(?=\))', tmpstr, re.IGNORECASE)
|
tmpstr = self.fixSpaces(filename)
|
||||||
if match:
|
found = False
|
||||||
count = match.group()
|
|
||||||
found = True
|
|
||||||
|
|
||||||
|
|
||||||
count = count.lstrip("0")
|
match = re.search('(?<=\sof\s)\d+(?=\s)', tmpstr, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
count = match.group()
|
||||||
|
found = True
|
||||||
|
|
||||||
return count
|
if not found:
|
||||||
|
match = re.search('(?<=\(of\s)\d+(?=\))', tmpstr, re.IGNORECASE)
|
||||||
def getIssueNumber( self, filename ):
|
if match:
|
||||||
|
count = match.group()
|
||||||
|
found = True
|
||||||
|
|
||||||
# Returns a tuple of issue number string, and start and end indexes in the filename
|
|
||||||
# (The indexes will be used to split the string up for further parsing)
|
|
||||||
|
|
||||||
found = False
|
|
||||||
issue = ''
|
|
||||||
start = 0
|
|
||||||
end = 0
|
|
||||||
|
|
||||||
# first, look for multiple "--", this means it's formatted differently from most:
|
|
||||||
if "--" in filename:
|
|
||||||
# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
|
|
||||||
filename = re.sub("--.*", self.repl, filename)
|
|
||||||
|
|
||||||
elif "__" in filename:
|
|
||||||
# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
|
|
||||||
filename = re.sub("__.*", self.repl, filename)
|
|
||||||
|
|
||||||
filename = filename.replace("+", " ")
|
count = count.lstrip("0")
|
||||||
|
|
||||||
# replace parenthetical phrases with spaces
|
|
||||||
filename = re.sub( "\(.*?\)", self.repl, filename)
|
|
||||||
filename = re.sub( "\[.*?\]", self.repl, filename)
|
|
||||||
|
|
||||||
# replace any name separators with spaces
|
return count
|
||||||
filename = self.fixSpaces(filename)
|
|
||||||
|
|
||||||
# remove any "of NN" phrase with spaces (problem: this could break on some titles)
|
def getIssueNumber( self, filename ):
|
||||||
filename = re.sub( "of [\d]+", self.repl, filename)
|
|
||||||
|
|
||||||
#print u"[{0}]".format(filename)
|
# Returns a tuple of issue number string, and start and end indexes in the filename
|
||||||
|
# (The indexes will be used to split the string up for further parsing)
|
||||||
# we should now have a cleaned up filename version with all the words in
|
|
||||||
# the same positions as original filename
|
|
||||||
|
|
||||||
# make a list of each word and its position
|
|
||||||
word_list = list()
|
|
||||||
for m in re.finditer("\S+", filename):
|
|
||||||
word_list.append( (m.group(0), m.start(), m.end()) )
|
|
||||||
|
|
||||||
# remove the first word, since it can't be the issue number
|
|
||||||
if len(word_list) > 1:
|
|
||||||
word_list = word_list[1:]
|
|
||||||
else:
|
|
||||||
#only one word?? just bail.
|
|
||||||
return issue, start, end
|
|
||||||
|
|
||||||
# Now try to search for the likely issue number word in the list
|
|
||||||
|
|
||||||
# first look for a word with "#" followed by digits with optional suffix
|
|
||||||
# this is almost certainly the issue number
|
|
||||||
for w in reversed(word_list):
|
|
||||||
if re.match("#[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
|
|
||||||
found = True
|
|
||||||
break
|
|
||||||
|
|
||||||
# same as above but w/o a '#', and only look at the last word in the list
|
found = False
|
||||||
if not found:
|
issue = ''
|
||||||
w = word_list[-1]
|
start = 0
|
||||||
if re.match("[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
|
end = 0
|
||||||
found = True
|
|
||||||
|
|
||||||
# now try to look for a # followed by any characters
|
|
||||||
if not found:
|
|
||||||
for w in reversed(word_list):
|
|
||||||
if re.match("#\S+", w[0]):
|
|
||||||
found = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if found:
|
|
||||||
issue = w[0]
|
|
||||||
start = w[1]
|
|
||||||
end = w[2]
|
|
||||||
if issue[0] == '#':
|
|
||||||
issue = issue[1:]
|
|
||||||
|
|
||||||
return issue, start, end
|
|
||||||
|
|
||||||
def getSeriesName(self, filename, issue_start ):
|
|
||||||
|
|
||||||
# use the issue number string index to split the filename string
|
|
||||||
|
|
||||||
if issue_start != 0:
|
|
||||||
filename = filename[:issue_start]
|
|
||||||
|
|
||||||
# in case there is no issue number, remove some obvious stuff
|
# first, look for multiple "--", this means it's formatted differently from most:
|
||||||
if "--" in filename:
|
if "--" in filename:
|
||||||
# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
|
# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
|
||||||
filename = re.sub("--.*", self.repl, filename)
|
filename = re.sub("--.*", self.repl, filename)
|
||||||
|
|
||||||
elif "__" in filename:
|
|
||||||
# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
|
|
||||||
filename = re.sub("__.*", self.repl, filename)
|
|
||||||
|
|
||||||
filename = filename.replace("+", " ")
|
|
||||||
tmpstr = self.fixSpaces(filename, remove_dashes=False)
|
|
||||||
|
|
||||||
series = tmpstr
|
|
||||||
volume = ""
|
|
||||||
|
|
||||||
#save the last word
|
elif "__" in filename:
|
||||||
try:
|
# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
|
||||||
last_word = series.split()[-1]
|
filename = re.sub("__.*", self.repl, filename)
|
||||||
except:
|
|
||||||
last_word = ""
|
|
||||||
|
|
||||||
# remove any parenthetical phrases
|
|
||||||
series = re.sub( "\(.*?\)", "", series)
|
|
||||||
|
|
||||||
# search for volume number
|
|
||||||
match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series)
|
|
||||||
if match:
|
|
||||||
series = match.group(1)
|
|
||||||
volume = match.group(3)
|
|
||||||
|
|
||||||
# if a volume wasn't found, see if the last word is a year in parentheses
|
|
||||||
# since that's a common way to designate the volume
|
|
||||||
if volume == "":
|
|
||||||
#match either (YEAR), (YEAR-), or (YEAR-YEAR2)
|
|
||||||
match = re.search("(\()(\d{4})(-(\d{4}|)|)(\))", last_word)
|
|
||||||
if match:
|
|
||||||
volume = match.group(2)
|
|
||||||
|
|
||||||
series = series.strip()
|
filename = filename.replace("+", " ")
|
||||||
|
|
||||||
# if we don't have an issue number (issue_start==0), look
|
# replace parenthetical phrases with spaces
|
||||||
# for hints i.e. "TPB", "one-shot", "OS", "OGN", etc that might
|
filename = re.sub( "\(.*?\)", self.repl, filename)
|
||||||
# be removed to help search online
|
filename = re.sub( "\[.*?\]", self.repl, filename)
|
||||||
if issue_start == 0:
|
|
||||||
one_shot_words = [ "tpb", "os", "one-shot", "ogn", "gn" ]
|
|
||||||
try:
|
|
||||||
last_word = series.split()[-1]
|
|
||||||
if last_word.lower() in one_shot_words:
|
|
||||||
series = series.rsplit(' ', 1)[0]
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return series, volume.strip()
|
|
||||||
|
|
||||||
def getYear( self,filename, issue_end):
|
# replace any name separators with spaces
|
||||||
|
filename = self.fixSpaces(filename)
|
||||||
filename = filename[issue_end:]
|
|
||||||
|
|
||||||
year = ""
|
# remove any "of NN" phrase with spaces (problem: this could break on some titles)
|
||||||
# look for four digit number with "(" ")" or "--" around it
|
filename = re.sub( "of [\d]+", self.repl, filename)
|
||||||
match = re.search('(\(\d\d\d\d\))|(--\d\d\d\d--)', filename)
|
|
||||||
if match:
|
|
||||||
year = match.group()
|
|
||||||
# remove non-numerics
|
|
||||||
year = re.sub("[^0-9]", "", year)
|
|
||||||
return year
|
|
||||||
|
|
||||||
def getRemainder( self, filename, year, count, issue_end ):
|
#print u"[{0}]".format(filename)
|
||||||
|
|
||||||
#make a guess at where the non-interesting stuff begins
|
|
||||||
remainder = ""
|
|
||||||
|
|
||||||
if "--" in filename:
|
|
||||||
remainder = filename.split("--",1)[1]
|
|
||||||
elif "__" in filename:
|
|
||||||
remainder = filename.split("__",1)[1]
|
|
||||||
elif issue_end != 0:
|
|
||||||
remainder = filename[issue_end:]
|
|
||||||
|
|
||||||
remainder = self.fixSpaces(remainder, remove_dashes=False)
|
# we should now have a cleaned up filename version with all the words in
|
||||||
if year != "":
|
# the same positions as original filename
|
||||||
remainder = remainder.replace(year,"",1)
|
|
||||||
if count != "":
|
|
||||||
remainder = remainder.replace("of "+count,"",1)
|
|
||||||
|
|
||||||
remainder = remainder.replace("()","")
|
|
||||||
|
|
||||||
return remainder.strip()
|
|
||||||
|
|
||||||
def parseFilename( self, filename ):
|
|
||||||
|
|
||||||
# remove the path
|
# make a list of each word and its position
|
||||||
filename = os.path.basename(filename)
|
word_list = list()
|
||||||
|
for m in re.finditer("\S+", filename):
|
||||||
|
word_list.append( (m.group(0), m.start(), m.end()) )
|
||||||
|
|
||||||
# remove the extension
|
# remove the first word, since it can't be the issue number
|
||||||
filename = os.path.splitext(filename)[0]
|
if len(word_list) > 1:
|
||||||
|
word_list = word_list[1:]
|
||||||
|
else:
|
||||||
|
#only one word?? just bail.
|
||||||
|
return issue, start, end
|
||||||
|
|
||||||
#url decode, just in case
|
# Now try to search for the likely issue number word in the list
|
||||||
filename = unquote(filename)
|
|
||||||
|
|
||||||
# sometimes archives get messed up names from too many decodings
|
# first look for a word with "#" followed by digits with optional suffix
|
||||||
# often url encodings will break and leave "_28" and "_29" in place
|
# this is almost certainly the issue number
|
||||||
# of "(" and ")" see if there are a number of these, and replace them
|
for w in reversed(word_list):
|
||||||
if filename.count("_28") > 1 and filename.count("_29") > 1:
|
if re.match("#[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
|
||||||
filename = filename.replace("_28", "(")
|
found = True
|
||||||
filename = filename.replace("_29", ")")
|
break
|
||||||
|
|
||||||
self.issue, issue_start, issue_end = self.getIssueNumber(filename)
|
# same as above but w/o a '#', and only look at the last word in the list
|
||||||
self.series, self.volume = self.getSeriesName(filename, issue_start)
|
if not found:
|
||||||
self.year = self.getYear(filename, issue_end)
|
w = word_list[-1]
|
||||||
self.issue_count = self.getIssueCount(filename, issue_end)
|
if re.match("[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
|
||||||
self.remainder = self.getRemainder( filename, self.year, self.issue_count, issue_end )
|
found = True
|
||||||
|
|
||||||
if self.issue != "":
|
# now try to look for a # followed by any characters
|
||||||
# strip off leading zeros
|
if not found:
|
||||||
self.issue = self.issue.lstrip("0")
|
for w in reversed(word_list):
|
||||||
if self.issue == "":
|
if re.match("#\S+", w[0]):
|
||||||
self.issue = "0"
|
found = True
|
||||||
if self.issue[0] == ".":
|
break
|
||||||
self.issue = "0" + self.issue
|
|
||||||
|
if found:
|
||||||
|
issue = w[0]
|
||||||
|
start = w[1]
|
||||||
|
end = w[2]
|
||||||
|
if issue[0] == '#':
|
||||||
|
issue = issue[1:]
|
||||||
|
|
||||||
|
return issue, start, end
|
||||||
|
|
||||||
|
def getSeriesName(self, filename, issue_start ):
|
||||||
|
|
||||||
|
# use the issue number string index to split the filename string
|
||||||
|
|
||||||
|
if issue_start != 0:
|
||||||
|
filename = filename[:issue_start]
|
||||||
|
|
||||||
|
# in case there is no issue number, remove some obvious stuff
|
||||||
|
if "--" in filename:
|
||||||
|
# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
|
||||||
|
filename = re.sub("--.*", self.repl, filename)
|
||||||
|
|
||||||
|
elif "__" in filename:
|
||||||
|
# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
|
||||||
|
filename = re.sub("__.*", self.repl, filename)
|
||||||
|
|
||||||
|
filename = filename.replace("+", " ")
|
||||||
|
tmpstr = self.fixSpaces(filename, remove_dashes=False)
|
||||||
|
|
||||||
|
series = tmpstr
|
||||||
|
volume = ""
|
||||||
|
|
||||||
|
#save the last word
|
||||||
|
try:
|
||||||
|
last_word = series.split()[-1]
|
||||||
|
except:
|
||||||
|
last_word = ""
|
||||||
|
|
||||||
|
# remove any parenthetical phrases
|
||||||
|
series = re.sub( "\(.*?\)", "", series)
|
||||||
|
|
||||||
|
# search for volume number
|
||||||
|
match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series)
|
||||||
|
if match:
|
||||||
|
series = match.group(1)
|
||||||
|
volume = match.group(3)
|
||||||
|
|
||||||
|
# if a volume wasn't found, see if the last word is a year in parentheses
|
||||||
|
# since that's a common way to designate the volume
|
||||||
|
if volume == "":
|
||||||
|
#match either (YEAR), (YEAR-), or (YEAR-YEAR2)
|
||||||
|
match = re.search("(\()(\d{4})(-(\d{4}|)|)(\))", last_word)
|
||||||
|
if match:
|
||||||
|
volume = match.group(2)
|
||||||
|
|
||||||
|
series = series.strip()
|
||||||
|
|
||||||
|
# if we don't have an issue number (issue_start==0), look
|
||||||
|
# for hints i.e. "TPB", "one-shot", "OS", "OGN", etc that might
|
||||||
|
# be removed to help search online
|
||||||
|
if issue_start == 0:
|
||||||
|
one_shot_words = [ "tpb", "os", "one-shot", "ogn", "gn" ]
|
||||||
|
try:
|
||||||
|
last_word = series.split()[-1]
|
||||||
|
if last_word.lower() in one_shot_words:
|
||||||
|
series = series.rsplit(' ', 1)[0]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return series, volume.strip()
|
||||||
|
|
||||||
|
def getYear( self,filename, issue_end):
|
||||||
|
|
||||||
|
filename = filename[issue_end:]
|
||||||
|
|
||||||
|
year = ""
|
||||||
|
# look for four digit number with "(" ")" or "--" around it
|
||||||
|
match = re.search('(\(\d\d\d\d\))|(--\d\d\d\d--)', filename)
|
||||||
|
if match:
|
||||||
|
year = match.group()
|
||||||
|
# remove non-numerics
|
||||||
|
year = re.sub("[^0-9]", "", year)
|
||||||
|
return year
|
||||||
|
|
||||||
|
def getRemainder( self, filename, year, count, volume, issue_end ):
|
||||||
|
|
||||||
|
#make a guess at where the non-interesting stuff begins
|
||||||
|
remainder = ""
|
||||||
|
|
||||||
|
if "--" in filename:
|
||||||
|
remainder = filename.split("--",1)[1]
|
||||||
|
elif "__" in filename:
|
||||||
|
remainder = filename.split("__",1)[1]
|
||||||
|
elif issue_end != 0:
|
||||||
|
remainder = filename[issue_end:]
|
||||||
|
|
||||||
|
remainder = self.fixSpaces(remainder, remove_dashes=False)
|
||||||
|
if volume != "":
|
||||||
|
remainder = remainder.replace("Vol."+volume,"",1)
|
||||||
|
if year != "":
|
||||||
|
remainder = remainder.replace(year,"",1)
|
||||||
|
if count != "":
|
||||||
|
remainder = remainder.replace("of "+count,"",1)
|
||||||
|
|
||||||
|
remainder = remainder.replace("()","")
|
||||||
|
remainder = remainder.replace(" "," ") # cleans some whitespace mess
|
||||||
|
|
||||||
|
return remainder.strip()
|
||||||
|
|
||||||
|
def parseFilename( self, filename ):
|
||||||
|
|
||||||
|
# remove the path
|
||||||
|
filename = os.path.basename(filename)
|
||||||
|
|
||||||
|
# remove the extension
|
||||||
|
filename = os.path.splitext(filename)[0]
|
||||||
|
|
||||||
|
#url decode, just in case
|
||||||
|
filename = unquote(filename)
|
||||||
|
|
||||||
|
# sometimes archives get messed up names from too many decodings
|
||||||
|
# often url encodings will break and leave "_28" and "_29" in place
|
||||||
|
# of "(" and ")" see if there are a number of these, and replace them
|
||||||
|
if filename.count("_28") > 1 and filename.count("_29") > 1:
|
||||||
|
filename = filename.replace("_28", "(")
|
||||||
|
filename = filename.replace("_29", ")")
|
||||||
|
|
||||||
|
self.issue, issue_start, issue_end = self.getIssueNumber(filename)
|
||||||
|
self.series, self.volume = self.getSeriesName(filename, issue_start)
|
||||||
|
|
||||||
|
|
||||||
|
# provides proper value when the filename doesn't have an issue number
|
||||||
|
if issue_end == 0:
|
||||||
|
issue_end=len(self.series)
|
||||||
|
|
||||||
|
self.year = self.getYear(filename, issue_end)
|
||||||
|
self.issue_count = self.getIssueCount(filename, issue_end)
|
||||||
|
self.remainder = self.getRemainder( filename, self.year, self.issue_count, self.volume, issue_end )
|
||||||
|
|
||||||
|
if self.issue != "":
|
||||||
|
# strip off leading zeros
|
||||||
|
self.issue = self.issue.lstrip("0")
|
||||||
|
if self.issue == "":
|
||||||
|
self.issue = "0"
|
||||||
|
if self.issue[0] == ".":
|
||||||
|
self.issue = "0" + self.issue
|
||||||
|
Loading…
x
Reference in New Issue
Block a user