"""Functions for parsing comic info from filename This should probably be re-written, but, well, it mostly works! """ # Copyright 2012-2014 Anthony Beville # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Some portions of this code were modified from pyComicMetaThis project # http://code.google.com/p/pycomicmetathis/ import os import re from urllib.parse import unquote class FileNameParser: def __init__(self): self.series = "" self.volume = "" self.year = "" self.issue_count = "" self.remainder = "" self.issue = "" def repl(self, m): return " " * len(m.group()) def fix_spaces(self, string, remove_dashes=True): if remove_dashes: placeholders = [r"[-_]", r" +"] else: placeholders = [r"[_]", r" +"] for ph in placeholders: string = re.sub(ph, self.repl, string) return string # .strip() def get_issue_count(self, filename, issue_end): count = "" filename = filename[issue_end:] # replace any name separators with spaces tmpstr = self.fix_spaces(filename) found = False match = re.search(r"(?<=\sof\s)\d+(?=\s)", tmpstr, re.IGNORECASE) if match: count = match.group() found = True if not found: match = re.search(r"(?<=\(of\s)\d+(?=\))", tmpstr, re.IGNORECASE) if match: count = match.group() count = count.lstrip("0") return count def get_issue_number(self, filename): """Returns a tuple of issue number string, and start and end indexes in the filename (The indexes will be used to split the string up for further parsing) """ found = False issue = "" start = 0 end = 0 # first, look for multiple "--", this means it's formatted differently # from most: if "--" in filename: # the pattern seems to be that anything to left of the first "--" # is the series name followed by issue filename = re.sub(r"--.*", self.repl, filename) elif "__" in filename: # the pattern seems to be that anything to left of the first "__" # is the series name followed by issue filename = re.sub(r"__.*", self.repl, filename) filename = filename.replace("+", " ") # replace parenthetical phrases with spaces filename = re.sub(r"\(.*?\)", self.repl, filename) filename = re.sub(r"\[.*?]", self.repl, filename) # replace any name separators with spaces filename = self.fix_spaces(filename) # remove any "of NN" phrase with spaces (problem: this could break on # some titles) filename = re.sub(r"of [\d]+", self.repl, filename) # print u"[{0}]".format(filename) # we should now have a cleaned up filename version with all the words in # the same positions as original filename # make a list of each word and its position word_list = [] for m in re.finditer(r"\S+", filename): word_list.append((m.group(0), m.start(), m.end())) # remove the first word, since it can't be the issue number if len(word_list) > 1: word_list = word_list[1:] else: # only one word?? just bail. return issue, start, end # Now try to search for the likely issue number word in the list # first look for a word with "#" followed by digits with optional suffix # this is almost certainly the issue number for w in reversed(word_list): if re.match(r"#[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]): found = True break # same as above but w/o a '#', and only look at the last word in the # list if not found: w = word_list[-1] if re.match(r"[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]): found = True # now try to look for a # followed by any characters if not found: for w in reversed(word_list): if re.match(r"#\S+", w[0]): found = True break if found: issue = w[0] start = w[1] end = w[2] if issue[0] == "#": issue = issue[1:] return issue, start, end def get_series_name(self, filename, issue_start): """Use the issue number string index to split the filename string""" if issue_start != 0: filename = filename[:issue_start] # in case there is no issue number, remove some obvious stuff if "--" in filename: # the pattern seems to be that anything to left of the first "--" # is the series name followed by issue filename = re.sub(r"--.*", self.repl, filename) elif "__" in filename: # the pattern seems to be that anything to left of the first "__" # is the series name followed by issue filename = re.sub(r"__.*", self.repl, filename) filename = filename.replace("+", " ") tmpstr = self.fix_spaces(filename, remove_dashes=False) series = tmpstr volume = "" # save the last word try: last_word = series.split()[-1] except: last_word = "" # remove any parenthetical phrases series = re.sub(r"\(.*?\)", "", series) # search for volume number match = re.search(r"(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$", series) if match: series = match.group(1) volume = match.group(3) # if a volume wasn't found, see if the last word is a year in parentheses # since that's a common way to designate the volume if volume == "": # match either (YEAR), (YEAR-), or (YEAR-YEAR2) match = re.search(r"(\()(\d{4})(-(\d{4}|)|)(\))", last_word) if match: volume = match.group(2) series = series.strip() # if we don't have an issue number (issue_start==0), look # for hints i.e. "TPB", "one-shot", "OS", "OGN", etc that might # be removed to help search online if issue_start == 0: one_shot_words = ["tpb", "os", "one-shot", "ogn", "gn"] try: last_word = series.split()[-1] if last_word.lower() in one_shot_words: series = series.rsplit(" ", 1)[0] except: pass return series, volume.strip() def get_year(self, filename, issue_end): filename = filename[issue_end:] year = "" # look for four digit number with "(" ")" or "--" around it match = re.search(r"(\(\d\d\d\d\))|(--\d\d\d\d--)", filename) if match: year = match.group() # remove non-digits year = re.sub(r"[^0-9]", "", year) return year def get_remainder(self, filename, year, count, volume, issue_end): """Make a guess at where the the non-interesting stuff begins""" remainder = "" if "--" in filename: remainder = filename.split("--", 1)[1] elif "__" in filename: remainder = filename.split("__", 1)[1] elif issue_end != 0: remainder = filename[issue_end:] remainder = self.fix_spaces(remainder, remove_dashes=False) if volume != "": remainder = remainder.replace("Vol." + volume, "", 1) if year != "": remainder = remainder.replace(year, "", 1) if count != "": remainder = remainder.replace("of " + count, "", 1) remainder = remainder.replace("()", "") remainder = remainder.replace(" ", " ") # cleans some whitespace mess return remainder.strip() def parse_filename(self, filename): # remove the path filename = os.path.basename(filename) # remove the extension filename = os.path.splitext(filename)[0] # url decode, just in case filename = unquote(filename) # sometimes archives get messed up names from too many decodes # often url encodings will break and leave "_28" and "_29" in place # of "(" and ")" see if there are a number of these, and replace them if filename.count("_28") > 1 and filename.count("_29") > 1: filename = filename.replace("_28", "(") filename = filename.replace("_29", ")") self.issue, issue_start, issue_end = self.get_issue_number(filename) self.series, self.volume = self.get_series_name(filename, issue_start) # provides proper value when the filename doesn't have a issue number if issue_end == 0: issue_end = len(self.series) self.year = self.get_year(filename, issue_end) self.issue_count = self.get_issue_count(filename, issue_end) self.remainder = self.get_remainder(filename, self.year, self.issue_count, self.volume, issue_end) if self.issue != "": # strip off leading zeros self.issue = self.issue.lstrip("0") if self.issue == "": self.issue = "0" if self.issue[0] == ".": self.issue = "0" + self.issue