#!/usr/bin/python
# This is horrible, and needs to be re-written, but, well, it mostly works!

import re
import os

try:
    # Python 3 location of unquote
    from urllib.parse import unquote
except ImportError:
    # Python 2 fallback
    from urllib import unquote


class FileNameParser:
    """Guess series, issue, volume and year from a comic archive filename.

    Call parseFilename() and then read the results from the ``issue``,
    ``series``, ``volume`` and ``year`` attributes.  All four are strings
    and are empty when nothing could be determined (``issue`` becomes
    "0" when the issue number consisted only of zeros).
    """

    def __init__(self):
        # Results of the most recent parseFilename() call.
        self.issue = ""
        self.series = ""
        self.volume = ""
        self.year = ""

    def fixSpaces(self, string):
        """Return *string* with '-' and '_' turned into spaces.

        Runs of spaces are collapsed to a single space and the result is
        stripped of leading/trailing whitespace.
        """
        string = re.sub(r'[-_]', ' ', string)
        string = re.sub(r' +', ' ', string)
        return string.strip()

    def isPointIssue(self, word):
        """Return True if *word* is a short decimal issue such as ".5" or "1.5".

        The word must be fewer than 5 characters long and consist only of
        digits around exactly one decimal point.  Anything else float()
        would happily parse ("nan", "inf", "1e5", signed values) is
        rejected, since those are words, not issue numbers.
        """
        if len(word) >= 5:
            return False
        return re.match(r'(\d+\.\d*|\.\d+)\Z', word) is not None

    def getIssueNumber(self, filename):
        """Return the issue number found in *filename*, or "" if none.

        The value is returned as written in the filename (leading zeros
        are kept).
        """
        # Guess based on position: replace any name separators with
        # spaces, then assume the last number in the filename that is
        # under 4 digits (or is a point issue) is the issue number.
        words = self.fixSpaces(filename).split(' ')
        for word in reversed(words):
            if (word.isdigit() and len(word) < 4) or self.isPointIssue(word):
                return word.strip()

        # Nothing found by position; fall back to the first number that
        # directly follows a separator ('_', '#', whitespace or '-').
        issnum = re.search(
            r'(?<=[_#\s-])(\d+[a-zA-Z]|\d+\.\d|\d+)', filename)
        if issnum:
            return issnum.group().strip()

        return ""

    def getSeriesName(self, filename, issue):
        """Return a ``(series, volume)`` tuple of strings for *filename*.

        *issue* is the issue string as it appears in the filename (as
        returned by getIssueNumber); pass "" when no issue is known.
        ``volume`` is "" unless the series text ends in ``v<digits>``;
        leading zeros are removed from it.
        """
        # Use the issue number string to split the filename string and
        # assume the first element is the series name, plus cruft.
        # !!! This can fail when the same digits also appear earlier in
        # the series name, because the split happens at the first
        # occurrence of the issue string !!!
        tmpstr = self.fixSpaces(filename)

        if issue != "":
            series = tmpstr.split(issue)[0]
        else:
            # No issue to work off of.
            # TODO: we should look for the year, and split from that,
            # and if that doesn't exist, remove parenthetical words.
            series = tmpstr

        volume = ""

        series = series.rstrip("#")

        # Search for a trailing volume marker such as "v2".
        match = re.search(r'(?<=v)(\d+)\s*$', series)
        if match:
            volume = match.group(1).lstrip("0")
            # The lookbehind guarantees the character just before the
            # digits is the 'v'; cut the series there.
            series = series[:match.start() - 1]

        return series.strip(), volume.strip()

    def getYear(self, filename):
        """Return the 4-digit year found in *filename*, or "" if none.

        Only years written as "(YYYY)" or "--YYYY--" are recognised.
        """
        match = re.search(r'(\(\d\d\d\d\))|(--\d\d\d\d--)', filename)
        if not match:
            return ""
        # Remove the surrounding non-numeric characters.
        return re.sub(r"[^0-9]", "", match.group())

    def parseFilename(self, filename):
        """Parse *filename* and store the results on this instance.

        Sets ``self.issue``, ``self.series``, ``self.volume`` and
        ``self.year``.  *filename* may include a directory path and may
        be URL-encoded.
        """
        # Remove the path.
        filename = os.path.basename(filename)

        # Remove the extension.
        filename = os.path.splitext(filename)[0]

        # URL decode, just in case.
        filename = unquote(filename)

        self.issue = self.getIssueNumber(filename)
        self.series, self.volume = self.getSeriesName(filename, self.issue)
        self.year = self.getYear(filename)

        if self.issue != "":
            # Strip off leading zeros; an all-zero issue becomes "0".
            self.issue = self.issue.lstrip("0")
            if self.issue == "":
                self.issue = "0"