From 128af4521b72919a8917f5108d5b8982af148549 Mon Sep 17 00:00:00 2001
From: "beville@gmail.com"
 <beville@gmail.com@6c5673fe-1810-88d6-992b-cd32ca31540c>
Date: Thu, 2 May 2013 16:31:50 +0000
Subject: [PATCH] better filename parsing

git-svn-id: http://comictagger.googlecode.com/svn/trunk@623 6c5673fe-1810-88d6-992b-cd32ca31540c
---
 comictaggerlib/filenameparser.py | 186 ++++++++++++++-----------------
 1 file changed, 85 insertions(+), 101 deletions(-)

diff --git a/comictaggerlib/filenameparser.py b/comictaggerlib/filenameparser.py
index 02b6ad3..0ea59e7 100644
--- a/comictaggerlib/filenameparser.py
+++ b/comictaggerlib/filenameparser.py
@@ -30,14 +30,18 @@ import os
 from urllib import unquote
 
 class FileNameParser:
+
+	def repl(self, m):
+	   return ' ' * len(m.group())
+	
 	def fixSpaces( self, string, remove_dashes=True ):
 		if remove_dashes:
 			placeholders = ['[-_]','  +']
 		else:
 			placeholders = ['[_]','  +']			
 		for ph in placeholders:
-			string = re.sub(ph, ' ', string )
-		return string.strip()
+			string = re.sub(ph, self.repl, string )
+		return string #.strip()
 
 	# check for silly .1 or .5 style issue strings
 	# allow up to 5 chars total
@@ -74,138 +78,118 @@ class FileNameParser:
 		count = count.lstrip("0")
 
 		return count
-		
-		
+	
 	def getIssueNumber( self, filename ):
 
+		# Returns a tuple of the issue number string, and its start and end indexes in the filename
+		# (The indexes will be used to split the string up for further parsing)
+		
 		found = False
 		issue = ''
-		original_filename = filename
+		start = 0
+		end = 0
 		
 		# first, look for multiple "--", this means it's formatted differently from most:
 		if "--" in filename:
 			# the pattern seems to be that anything to left of the first "--" is the series name followed by issue
-			filename = filename.split("--")[0]
-		elif "___" in filename:
+			filename = re.sub("--.*", self.repl, filename)	
+			
+		elif "__" in filename:
 			# the pattern seems to be that anything to left of the first "__" is the series name followed by issue
-			filename = filename.split("__")[0]
+			filename = re.sub("__.*", self.repl, filename)	
 
 		filename = filename.replace("+", " ")
 			
-		# remove parenthetical phrases
-		filename = re.sub( "\(.*?\)", "", filename)
-		filename = re.sub( "\[.*?\]", "", filename)
-		
-		# guess based on position
+		# replace parenthetical phrases with spaces
+		filename = re.sub( "\(.*?\)", self.repl, filename)
+		filename = re.sub( "\[.*?\]", self.repl, filename)
 
 		# replace any name seperators with spaces
-		tmpstr = self.fixSpaces(filename)
-		word_list = tmpstr.split(' ')
+		filename = self.fixSpaces(filename)
+
+		# replace any "of NN" phrase with spaces (problem: this might break some titles)
+		filename = re.sub( "of [\d]+", self.repl, filename)
+
+		print u"[{0}]".format(filename)
 		
-		#before we search, remove any kind of likely "of X" phrase
-		for i in range(0, len(word_list)-2):
-			if ( word_list[i].isdigit() and
-				word_list[i+1] == "of"  and
-				word_list[i+2].isdigit() ):
-				word_list[i+1] ="XXX"
-				word_list[i+2] ="XXX"
-				
-				
-		# first look for the last "#" followed by a digit in the filename. this is almost certainly the issue number
-		#issnum = re.search('#\d+', filename)
-		matchlist = re.findall("#[-+]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", filename)
-		if len(matchlist) > 0:
-			#get the last item
-			issue = matchlist[ len(matchlist) - 1][0]
-			print 'Assuming issue number is ' + str(issue) + ' based on first test.'
-
-			found = True
-
-		# assume the last number in the filename that is under 4 digits is the issue number
-		if not found:
-			for word in reversed(word_list):
-				if len(word) > 0 and word[0] == "#":
-					word = word[1:]
-				if ( 
-					 (word.isdigit() and len(word) < 4) or
-					 (self.isPointIssue(word))
-					):
-					issue = word
-					found = True
-					print 'Assuming issue number is ' + str(issue) + ' based on the position.'
-					break
-
-		if not found:
-			# try a regex
-			#issnum = re.search('(?<=[_#\s-])(\d+[a-zA-Z]+|\d+\.\d|\d+)', filename)
-			issnum = re.search('(?<=[_#\s-])(\d+[^\d]+|\d+\.\d|\d+)', filename)
-			if issnum:
-				issue = issnum.group()
+		# we should now have a cleaned up filename version with all the words in
+		# the same positions as original filename
+			
+		# make a list of each word and its position
+		word_list = list()
+		for m in re.finditer("\S+", filename):
+			word_list.append( (m.group(0), m.start(), m.end()) )
+					
+		# Now try to search for the likely issue number word in the list
+		
+		# first look for a word with "#" followed by digits with an optional suffix
+		# this is almost certainly the issue number
+		for w in reversed(word_list):
+			if re.match("#[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
 				found = True
-				print 'Got the issue using regex. Issue is ' + issue
-			
-		
-		# take a stab at working out the span of the issue subtring in the original
-		# (this should really be done which each search, so we're not just always guessing)
-		if found:
-			cnt = 0
-			print "issue str = [{0}], {1}".format(issue, original_filename)
-			span = None
-			pattern = "\()"
-			for g in re.finditer(issue, original_filename):
-				#print g.span()
-				cnt += 1
-				if cnt > 1:
+				break
+
+		# same as above but w/o a '#' 		
+		if not found:
+			for w in reversed(word_list):
+				if re.match("[-]?(([0-9]*\.[0-9]+|[0-9]+)(\w*))", w[0]):
+					found = True
 					break
-			else:
-				if cnt == 1:
-					span = g.span()
-			print span
-			
-		issue = issue.strip()			
+				
+		# now try to look for a # followed by any word characters
+		if not found:
+			for w in reversed(word_list):
+				if re.match("#\w+", w[0]):
+					found = True
+					break
+				
+		if found:
+			issue = w[0]
+			start = w[1]
+			end = w[2]
+			if issue[0] == '#':
+				issue = issue[1:]
+				
+		return issue, start, end
 		
-		return issue
+	def getSeriesName(self, filename, issue_start ):
 
-	def getSeriesName(self, filename, issue ):
-
-		# use the issue number string to split the filename string
-		# assume first element of list is the series name, plus cruft
-		#!!! this could fail in the case of small numerics in the series name!!!
-
-		# TODO:  we really should pass in the *INDEX* of the issue, that makes 
-		# finding it easier
+		# use the start index of the issue number string to truncate the filename string
+		
+		filename = filename[:issue_start-1]
 		
 		filename = filename.replace("+", " ")
 		tmpstr = self.fixSpaces(filename, remove_dashes=False)
 		
-		#remove pound signs.  this might mess up the series name if there is a# in it.
-		tmpstr = tmpstr.replace("#", " ")
-		
-		if issue != "":	
-			# assume that issue substr has at least one space before it
-			issue_str = " " + str(issue)
-			series = tmpstr.split(issue_str)[0]
-		else:
-			# no issue to work off of
-			#!!! TODO we should look for the year, and split from that
-			series = tmpstr
-			
+		series = tmpstr	
 		volume = ""
+
+		#save the last word		
+		last_word = series.split()[-1]
 		
 		# remove any parenthetical phrases
 		series = re.sub( "\(.*?\)", "", series)
-
-		series = series.rstrip("#")
 			
 		# search for volume number
 		match = re.search('(.+)([vV]|[Vv][oO][Ll]\.?\s?)(\d+)\s*$', series)
 		if match:
 			series = match.group(1)
 			volume = match.group(3)
+			
+		# if a volume wasn't found, see if the last word is a year in parentheses
+		# since that's a common way to designate the volume
+		if volume == "":
+			#match either (YEAR), (YEAR-), or (YEAR-YEAR2)
+			match = re.search("(\()(\d{4})(-(\d{4}|)|)(\))", last_word)
+			if match:
+				volume = match.group(2)
+
 		
 		return series.strip(), volume.strip()
 
-	def getYear( self,filename):
+	def getYear( self,filename, issue_end):
+
+		filename = filename[issue_end:]
 
 		year = ""
 		# look for four digit number with "(" ")" or "--" around it
@@ -256,9 +240,9 @@ class FileNameParser:
 			filename = filename.replace("_28", "(")
 			filename = filename.replace("_29", ")")
 					
-		self.issue = self.getIssueNumber(filename)
-		self.series, self.volume = self.getSeriesName(filename, self.issue)
-		self.year = self.getYear(filename)
+		self.issue, issue_start, issue_end = self.getIssueNumber(filename)
+		self.series, self.volume = self.getSeriesName(filename, issue_start)
+		self.year = self.getYear(filename, issue_end)
 		self.issue_count = self.getIssueCount(filename)
 		self.remainder = self.getRemainder( filename, self.year, self.issue_count )