Lyrics: discard HTML fetched from internet

author: jerous <jerous@gmail.com> 2008-11-04 21:30:40 +0100
committer: jerous <jerous@gmail.com> 2008-11-04 21:30:40 +0100
commit: 6e61921c5dad2abef348b3874057fbd925bafe61 (patch)
tree: aaad9600d94013db9178a62adf20ca4dc7b137ee /misc.py
parent: c6d056d2c48b419b8e5e0cb1df17131df481bbaf (diff)
1 file changed, 18 insertions, 7 deletions
diff --git a/misc.py b/misc.py
index 962f40a..168c2ba 100644
--- a/misc.py
+++ b/misc.py
@@ -39,12 +39,14 @@ def toAscii(ustr):
 		return ustr
 	return unicodedata.normalize('NFKD', ustr).encode('ascii', 'ignore')
 
-def fetch(SE, sites, song=None, xtra_tags={}):
+def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
 	"""Returns None when nothing found, or [site,source-url]."""
+	# compose the search-engine URL
 	f=format.compile(SE)
 	SE_url=toAscii(f(format.params(song, xtra_tags)))
 	SE_url=SE_url.replace(' ', '+')
 	
+	# fetch the page from the search-engine with the results
 	request=urllib2.Request(SE_url)
 	request.add_header('User-Agent', 'montypc')
 	opener=urllib2.build_opener()
@@ -54,13 +56,15 @@ def fetch(SE, sites, song=None, xtra_tags={}):
 	regex=re.compile('<a href="(.*?)".*?>.*?<\/a>')
 	urls=regex.findall(data)
 	
-	# look for predefined urls, which are good lyrics-sites
-	# we assume they are in order of importance; the first one matching
-	# is taken
+	# look for urls which are defined in $sites.
+	# The first matching URL is taken
 	finalRegex=None
 	log.debug("Checking %i URLs on %s"%(len(sites), SE_url))
+	# loop over all sites which may have what we're interested in
 	for site in sites:
 		finalURL=None
+		finalRegex=None
+		# check if on the results-page there is a link to $site
 		for url in urls:
 			if url.find(site)>=0:
 				log.debug("  Found site %s in results: %s"%(site, url))
@@ -68,14 +72,17 @@ def fetch(SE, sites, song=None, xtra_tags={}):
 				finalRegex=sites[site]
 				break
 
-		match=None
 		if finalURL:
+			match=None
+			# open the url
 			cj = cookielib.CookieJar()
 			opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
 			log.debug("  Reading URL %s"%(finalURL))
 			try:
+				# read the page
 				r = opener.open(finalURL)
 				data=r.read()
+				# apply the site's regular expression to the page
 				regex=re.compile(finalRegex, re.IGNORECASE|re.MULTILINE|re.DOTALL)
 				match=regex.search(data)
 			except Exception, e:
@@ -83,10 +90,14 @@ def fetch(SE, sites, song=None, xtra_tags={}):
 				continue
 			
 			if match:
+				# if the regex matches, then we arrive here
+				# we assume the content we want is in the first group
 				log.debug("  Regex succeeded!")
 				data=match.group(1)
-				data=data.replace('<br>', '<br />')
-				data=data.replace('<br />', '<br />')
+				if stripHTML:
+					# caller wants plain text, so convert the HTML fragment
+					data=re.sub('<br.*?>', '\n', data)	# replace <br />'s with newline
+					data=re.sub('<[^>]*?>', '', data) 	# strip all other HTML
 				data=data.strip()
 				log.debug("Succeeded fetching.")
 				return [data,finalURL]
author	jerous <jerous@gmail.com>	2008-11-04 21:30:40 +0100
committer	jerous <jerous@gmail.com>	2008-11-04 21:30:40 +0100
commit	6e61921c5dad2abef348b3874057fbd925bafe61 (patch)
tree	aaad9600d94013db9178a62adf20ca4dc7b137ee /misc.py
parent	c6d056d2c48b419b8e5e0cb1df17131df481bbaf (diff)