various fixes after merging comicstream-integr

2015-02-16 16:19:38 +01:00 · 2015-02-16 16:19:38 +01:00 · 65bc97f3f6
commit 65bc97f3f6
parent 259769ae59
1 changed files with 230 additions and 221 deletions
--- a/comicapi/filenameparser.py
+++ b/comicapi/filenameparser.py
@ -12,7 +12,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-	http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@ -31,247 +31,256 @@ from urllib import unquote
 class FileNameParser:
-	def repl(self, m):
+    def repl(self, m):
-	   return ' ' * len(m.group())
+       return ' ' * len(m.group())
-	
+
-	def fixSpaces( self, string, remove_dashes=True ):
+    def fixSpaces( self, string, remove_dashes=True ):
-		if remove_dashes:
+        if remove_dashes:
-			placeholders = ['[-_]','  +']
+            placeholders = ['[-_]','  +']
-		else:
+        else:
-			placeholders = ['[_]','  +']			
+            placeholders = ['[_]','  +']
-		for ph in placeholders:
+        for ph in placeholders:
-			string = re.sub(ph, self.repl, string )
+            string = re.sub(ph, self.repl, string )
-		return string #.strip()
+        return string #.strip()
-	def getIssueCount( self,filename, issue_end ):
+    def getIssueCount( self,filename, issue_end ):
-		count = ""
+        count = ""
-		filename = filename[issue_end:]
+        filename = filename[issue_end:]
 		# replace any name seperators with spaces
 		tmpstr = self.fixSpaces(filename)
 		found = False
 		match = re.search('(?<=\sof\s)\d+(?=\s)', tmpstr, re.IGNORECASE)
 		if match:
 			count = match.group()
 			found = True
-		if not found:
+        # replace any name seperators with spaces
-			match = re.search('(?<=\(of\s)\d+(?=\))', tmpstr,  re.IGNORECASE)
+        tmpstr = self.fixSpaces(filename)
-			if match:
+        found = False
 				count = match.group()
 				found = True
-		count = count.lstrip("0")
+        match = re.search('(?<=\sof\s)\d+(?=\s)', tmpstr, re.IGNORECASE)
        if match:
            count = match.group()
            found = True
-		return count
+        if not found:
-	
+            match = re.search('(?<=\(of\s)\d+(?=\))', tmpstr,  re.IGNORECASE)
-	def getIssueNumber( self, filename ):
+            if match:
                count = match.group()
                found = True
 		# Returns a tuple of issue number string, and start and end indexs in the filename
 		# (The indexes will be used to split the string up for further parsing)
 		found = False
 		issue = ''
 		start = 0
 		end = 0
 		# first, look for multiple "--", this means it's formatted differently from most:
 		if "--" in filename:
 			# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
 			filename = re.sub("--.*", self.repl, filename)	
 		elif "__" in filename:
 			# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
 			filename = re.sub("__.*", self.repl, filename)	
-		filename = filename.replace("+", " ")
+        count = count.lstrip("0")
 		# replace parenthetical phrases with spaces
 		filename = re.sub( "\(.*?\)", self.repl, filename)
 		filename = re.sub( "\[.*?\]", self.repl, filename)
-		# replace any name seperators with spaces
+        return count
 		filename = self.fixSpaces(filename)
-		# remove any "of NN" phrase with spaces (problem: this could break on some titles)
+    def getIssueNumber( self, filename ):
 		filename = re.sub( "of [\d]+", self.repl, filename)
-		#print u"[{0}]".format(filename)
+        # Returns a tuple of issue number string, and start and end indexs in the filename
-		
+        # (The indexes will be used to split the string up for further parsing)
 		# we should now have a cleaned up filename version with all the words in
 		# the same positions as original filename
 		# make a list of each word and its position
 		word_list = list()
 		for m in re.finditer("\S+", filename):
 			word_list.append( (m.group(0), m.start(), m.end()) )
 		# remove the first word, since it can't be the issue number
 		if len(word_list) > 1:
 			word_list = word_list[1:]
 		else:
 			#only one word??  just bail.
 			return issue, start, end
 		# Now try to search for the likely issue number word in the list
 		# first look for a word with "#" followed by digits with optional sufix 
 		# this is almost certainly the issue number
 		for w in reversed(word_list):
 			if re.match("#[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
 				found = True
 				break
-		# same as above but w/o a '#', and only look at the last word in the list	
+        found = False
-		if not found:
+        issue = ''
-			w  = word_list[-1]
+        start = 0
-			if re.match("[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
+        end = 0
 				found = True
 		# now try to look for a # followed by any characters		
 		if not found:
 			for w in reversed(word_list):
 				if re.match("#\S+", w[0]):
 					found = True
 					break
 		if found:
 			issue = w[0]
 			start = w[1]
 			end = w[2]
 			if issue[0] == '#':
 				issue = issue[1:]
 		return issue, start, end
 	def getSeriesName(self, filename, issue_start ):
 		# use the issue number string index to split the filename string
 		if issue_start != 0:
 			filename = filename[:issue_start]
-		# in case there is no issue number, remove some obvious stuff
+        # first, look for multiple "--", this means it's formatted differently from most:
-		if "--" in filename:
+        if "--" in filename:
-			# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
+            # the pattern seems to be that anything to left of the first "--" is the series name followed by issue
-			filename = re.sub("--.*", self.repl, filename)	
+            filename = re.sub("--.*", self.repl, filename)
 		elif "__" in filename:
 			# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
 			filename = re.sub("__.*", self.repl, filename)
 		filename = filename.replace("+", " ")
 		tmpstr = self.fixSpaces(filename, remove_dashes=False)
 		series = tmpstr	
 		volume = ""
-		#save the last word
+        elif "__" in filename:
-		try: 
+            # the pattern seems to be that anything to left of the first "__" is the series name followed by issue
-			last_word = series.split()[-1]
+            filename = re.sub("__.*", self.repl, filename)
 		except:
 			last_word = ""			
 		# remove any parenthetical phrases
 		series = re.sub( "\(.*?\)", "", series)
 		# search for volume number
 		match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series)
 		if match:
 			series = match.group(1)
 			volume = match.group(3)
 		# if a volume wasn't found, see if the last word is a year in parentheses
 		# since that's a common way to designate the volume
 		if volume == "":
 			#match either (YEAR), (YEAR-), or (YEAR-YEAR2)
 			match = re.search("(\()(\d{4})(-(\d{4}|)|)(\))", last_word)
 			if match:
 				volume = match.group(2)
-		series = series.strip()
+        filename = filename.replace("+", " ")
-		# if we don't have an issue number (issue_start==0), look
+        # replace parenthetical phrases with spaces
-		# for hints i.e. "TPB", "one-shot", "OS", "OGN", etc that might
+        filename = re.sub( "\(.*?\)", self.repl, filename)
-		# be removed to help search online		
+        filename = re.sub( "\[.*?\]", self.repl, filename)
 		if issue_start == 0:
 			one_shot_words = [ "tpb", "os", "one-shot", "ogn", "gn" ]
 			try:
 				last_word = series.split()[-1]
 				if last_word.lower() in one_shot_words:
 					series = series.rsplit(' ', 1)[0]
 			except:
 				pass
 		return series, volume.strip()
-	def getYear( self,filename, issue_end):
+        # replace any name seperators with spaces
-		
+        filename = self.fixSpaces(filename)
 		filename = filename[issue_end:]
-		year = ""
+        # remove any "of NN" phrase with spaces (problem: this could break on some titles)
-		# look for four digit number with "(" ")" or "--" around it
+        filename = re.sub( "of [\d]+", self.repl, filename)
 		match = re.search('(\(\d\d\d\d\))|(--\d\d\d\d--)', filename)
 		if match:
 			year = match.group()
 			# remove non-numerics
 			year = re.sub("[^0-9]", "", year)
 		return year
-	def getRemainder( self, filename, year, count, issue_end ):
+        #print u"[{0}]".format(filename)
 		#make a guess at where the the non-interesting stuff begins
 		remainder = ""
 		if "--" in filename:
 			remainder = filename.split("--",1)[1]
 		elif "__" in filename:
 			remainder = filename.split("__",1)[1]
 		elif issue_end != 0:
 			remainder = filename[issue_end:]
-		remainder = self.fixSpaces(remainder, remove_dashes=False)
+        # we should now have a cleaned up filename version with all the words in
-		if year != "":
+        # the same positions as original filename
 			remainder = remainder.replace(year,"",1)
 		if count != "":
 			remainder = remainder.replace("of "+count,"",1)
 		remainder = remainder.replace("()","")
 		return remainder.strip()
 	def parseFilename( self, filename ):
-		# remove the path
+        # make a list of each word and its position
-		filename = os.path.basename(filename)
+        word_list = list()
        for m in re.finditer("\S+", filename):
            word_list.append( (m.group(0), m.start(), m.end()) )
-		# remove the extension
+        # remove the first word, since it can't be the issue number
-		filename = os.path.splitext(filename)[0]
+        if len(word_list) > 1:
            word_list = word_list[1:]
        else:
            #only one word??  just bail.
            return issue, start, end
-		#url decode, just in case
+        # Now try to search for the likely issue number word in the list
 		filename = unquote(filename)
-		# sometimes archives get messed up names from too many decodings
+        # first look for a word with "#" followed by digits with optional sufix
-		# often url encodings will break and leave "_28" and "_29" in place
+        # this is almost certainly the issue number
-		# of "(" and ")"  see if there are a number of these, and replace them
+        for w in reversed(word_list):
-		if filename.count("_28") > 1 and filename.count("_29") > 1:
+            if re.match("#[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
-			filename = filename.replace("_28", "(")
+                found = True
-			filename = filename.replace("_29", ")")
+                break
-					
+
-		self.issue, issue_start, issue_end = self.getIssueNumber(filename)
+        # same as above but w/o a '#', and only look at the last word in the list
-		self.series, self.volume = self.getSeriesName(filename, issue_start)
+        if not found:
-		self.year = self.getYear(filename, issue_end)
+            w  = word_list[-1]
-		self.issue_count = self.getIssueCount(filename, issue_end)
+            if re.match("[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
-		self.remainder = self.getRemainder( filename, self.year, self.issue_count, issue_end )
+                found = True
-	
+
-		if self.issue != "":
+        # now try to look for a # followed by any characters
-			# strip off leading zeros
+        if not found:
-			self.issue = self.issue.lstrip("0")
+            for w in reversed(word_list):
-			if self.issue == "":
+                if re.match("#\S+", w[0]):
-				self.issue = "0"
+                    found = True
-			if self.issue[0] == ".":
+                    break
-				self.issue = "0" + self.issue
+
        if found:
            issue = w[0]
            start = w[1]
            end = w[2]
            if issue[0] == '#':
                issue = issue[1:]
        return issue, start, end
    def getSeriesName(self, filename, issue_start ):
        """Extract the series name and volume from a filename.

        filename is the name being parsed and issue_start is the index at
        which the issue number begins; 0 means no issue number was found,
        in which case the whole name is examined.

        Returns a (series, volume) tuple of strings.  volume is "" when no
        volume marker ("v2", "Vol. 2", ...) or trailing "(YEAR)" is found.
        """
        # use the issue number string index to split the filename string
        if issue_start != 0:
            filename = filename[:issue_start]

        # in case there is no issue number, remove some obvious stuff:
        # anything from the first "--" (or, failing that, "__") onwards is
        # blanked out so only the leading series text remains
        for marker, pattern in (("--", "--.*"), ("__", "__.*")):
            if marker in filename:
                filename = re.sub(pattern, self.repl, filename)
                break

        filename = filename.replace("+", " ")
        series = self.fixSpaces(filename, remove_dashes=False)
        volume = ""

        # save the last word before parentheticals are stripped; it may be
        # a "(YEAR)" that doubles as the volume designation
        words = series.split()
        last_word = words[-1] if words else ""

        # remove any parenthetical phrases
        series = re.sub(r"\(.*?\)", "", series)

        # search for volume number
        vol_match = re.search(r'(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series)
        if vol_match:
            series = vol_match.group(1)
            volume = vol_match.group(3)

        # if a volume wasn't found, see if the last word is a year in
        # parentheses since that's a common way to designate the volume
        if volume == "":
            # match either (YEAR), (YEAR-), or (YEAR-YEAR2)
            year_match = re.search(r"(\()(\d{4})(-(\d{4}|)|)(\))", last_word)
            if year_match:
                volume = year_match.group(2)

        series = series.strip()

        # if we don't have an issue number (issue_start==0), look
        # for hints i.e. "TPB", "one-shot", "OS", "OGN", etc that might
        # be removed to help search online
        if issue_start == 0:
            one_shot_words = [ "tpb", "os", "one-shot", "ogn", "gn" ]
            words = series.split()
            if words and words[-1].lower() in one_shot_words:
                series = series.rsplit(' ', 1)[0]

        return series, volume.strip()
    def getYear( self,filename, issue_end):
        """Return the four-digit year found after the issue number.

        Only the part of filename from index issue_end onwards is searched.
        A year is recognised when it is written as "(YYYY)" or "--YYYY--".
        Returns the digits as a string, or "" when nothing matches.
        """
        tail = filename[issue_end:]

        # look for four digit number with "(" ")" or "--" around it
        hit = re.search(r'(\(\d\d\d\d\))|(--\d\d\d\d--)', tail)
        if not hit:
            return ""

        # remove non-numerics (the surrounding parentheses or dashes)
        return re.sub(r"[^0-9]", "", hit.group())
    def getRemainder( self, filename, year, count, volume, issue_end ):
        """Return the leftover text of a filename once known fields are removed.

        The leftover starts after the first "--", else after the first "__",
        else at index issue_end (when that is non-zero); otherwise it is
        empty.  The first occurrence of "Vol.<volume>", of year and of
        "of <count>" is then dropped, for each of those arguments that is
        not "".  Returns the stripped result.
        """
        # make a guess at where the non-interesting stuff begins
        if "--" in filename:
            leftover = filename.partition("--")[2]
        elif "__" in filename:
            leftover = filename.partition("__")[2]
        elif issue_end != 0:
            leftover = filename[issue_end:]
        else:
            leftover = ""

        leftover = self.fixSpaces(leftover, remove_dashes=False)

        # drop each already-parsed field once, in this order
        for prefix, value in (("Vol.", volume), ("", year), ("of ", count)):
            if value != "":
                leftover = leftover.replace(prefix + value, "", 1)

        # empty parentheses are what is left behind by a removed "(YEAR)"
        leftover = leftover.replace("()","")
        # cleans some whitespace mess
        leftover = leftover.replace("  "," ")
        return leftover.strip()
    def parseFilename( self, filename ):
        """Parse filename and store the pieces on this object.

        Sets self.issue, self.series, self.volume, self.year,
        self.issue_count and self.remainder.  Nothing is returned.
        """
        # work on the bare name: no directory part, no extension
        base = os.path.splitext(os.path.basename(filename))[0]

        # url decode, just in case
        name = unquote(base)

        # sometimes archives get messed up names from too many decodings;
        # often url encodings will break and leave "_28" and "_29" in place
        # of "(" and ")".  Only undo that when both show up more than once.
        if min(name.count("_28"), name.count("_29")) > 1:
            name = name.replace("_28", "(").replace("_29", ")")

        self.issue, issue_start, issue_end = self.getIssueNumber(name)
        self.series, self.volume = self.getSeriesName(name, issue_start)

        # provides proper value when the filename doesn't have a issue number
        if issue_end == 0:
            issue_end = len(self.series)

        self.year = self.getYear(name, issue_end)
        self.issue_count = self.getIssueCount(name, issue_end)
        self.remainder = self.getRemainder(
            name, self.year, self.issue_count, self.volume, issue_end )

        if self.issue != "":
            # strip off leading zeros, but keep a lone "0"
            issue = self.issue.lstrip("0") or "0"
            # ".5" style issues get a leading zero back
            if issue[0] == ".":
                issue = "0" + issue
            self.issue = issue