path: root/misc.py
author     jerous <jerous@gmail.com>  2008-11-04 21:30:40 +0100
committer  jerous <jerous@gmail.com>  2008-11-04 21:30:40 +0100
commit     6e61921c5dad2abef348b3874057fbd925bafe61 (patch)
tree       aaad9600d94013db9178a62adf20ca4dc7b137ee /misc.py
parent     c6d056d2c48b419b8e5e0cb1df17131df481bbaf (diff)
Lyrics: discard HTML fetched from internet
Diffstat (limited to 'misc.py')
-rw-r--r--  misc.py  25
1 file changed, 18 insertions, 7 deletions
diff --git a/misc.py b/misc.py
index 962f40a..168c2ba 100644
--- a/misc.py
+++ b/misc.py
@@ -39,12 +39,14 @@ def toAscii(ustr):
        return ustr
    return unicodedata.normalize('NFKD', ustr).encode('ascii', 'ignore')
-def fetch(SE, sites, song=None, xtra_tags={}):
+def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
    """Returns None when nothing found, or [site,source-url]."""
+    # compose the search-engine URL
    f=format.compile(SE)
    SE_url=toAscii(f(format.params(song, xtra_tags)))
    SE_url=SE_url.replace(' ', '+')
+    # fetch the page from the search-engine with the results
    request=urllib2.Request(SE_url)
    request.add_header('User-Agent', 'montypc')
    opener=urllib2.build_opener()
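
The new stripHTML flag defaults to True, so by default the fetched markup is discarded, as the commit message says; callers that want to keep the HTML must pass stripHTML=False. The results page itself is fetched with plain urllib2 and a custom User-Agent, roughly this pattern (a Python 2 sketch matching the module; fetch_page is an illustrative name, not part of the patch):

    import urllib2

    def fetch_page(url, user_agent='montypc'):
        # build the request with the same User-Agent header the code above sets
        request = urllib2.Request(url)
        request.add_header('User-Agent', user_agent)
        # a plain opener is enough for the search-engine results page
        opener = urllib2.build_opener()
        return opener.open(request).read()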
@@ -54,13 +56,15 @@ def fetch(SE, sites, song=None, xtra_tags={}):
    regex=re.compile('<a href="(.*?)".*?>.*?<\/a>')
    urls=regex.findall(data)
-    # look for predefined urls, which are good lyrics-sites
-    # we assume they are in order of importance; the first one matching
-    # is taken
+    # look for urls which are defined in $sites.
+    # The first matching URL is taken
    finalRegex=None
    log.debug("Checking %i URLs on %s"%(len(sites), SE_url))
+    # loop over all sites which may have what we're interested in
    for site in sites:
        finalURL=None
+        finalRegex=None
+        # check if on the results-page there is a link to $site
        for url in urls:
            if url.find(site)>=0:
                log.debug(" Found site %s in results: %s"%(site, url))
@@ -68,14 +72,17 @@ def fetch(SE, sites, song=None, xtra_tags={}):
                finalRegex=sites[site]
                break
-        match=None
        if finalURL:
+            match=None
+            # open the url
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            log.debug(" Reading URL %s"%(finalURL))
            try:
+                # read the page
                r = opener.open(finalURL)
                data=r.read()
+                # perform the regular expression
                regex=re.compile(finalRegex, re.IGNORECASE|re.MULTILINE|re.DOTALL)
                match=regex.search(data)
            except Exception, e:
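
The candidate page is read through a cookie-aware opener and searched with the per-site regex, compiled with IGNORECASE, MULTILINE and DOTALL so the capture can span line breaks. A minimal sketch of that step (Python 2; extract_match is an illustrative name, and errors are left to the caller instead of being logged as above):

    import re
    import urllib2
    import cookielib

    def extract_match(url, pattern):
        # cookie-aware opener, mirroring the CookieJar set up above
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        data = opener.open(url).read()
        # DOTALL lets .*? cross newlines, so one group can capture a whole lyrics block
        regex = re.compile(pattern, re.IGNORECASE | re.MULTILINE | re.DOTALL)
        return regex.search(data)  # None when the page layout does not match the regex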
@@ -83,10 +90,14 @@ def fetch(SE, sites, song=None, xtra_tags={}):
                continue
            if match:
+                # if the regex matches, then we arrive here
+                # we assume the content we want is in the first group
                log.debug(" Regex succeeded!")
                data=match.group(1)
-                data=data.replace('<br>', '<br />')
-                data=data.replace('<br />', '<br />')
+                if stripHTML:
+                    # do we want HTML?
+                    data=re.sub('<br.*?>', '\n', data) # replace <br />'s with newline
+                    data=re.sub('<[^>]*?>', '', data) # strip all other HTML
                data=data.strip()
                log.debug("Succeeded fetching.")
                return [data,finalURL]
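
The stripHTML branch is the change the commit message refers to: instead of only normalising <br> tags, it now turns any <br> variant into a newline and drops every remaining tag, so callers receive plain text. A self-contained sketch of that post-processing (Python 2; strip_html is an illustrative name, the two substitutions are the ones added above):

    import re

    def strip_html(data):
        # turn <br>, <br/> and <br /> into newlines
        data = re.sub('<br.*?>', '\n', data)
        # drop every remaining tag
        data = re.sub('<[^>]*?>', '', data)
        return data.strip()

    # e.g. strip_html('line one<br />line two<i>!</i>') returns 'line one\nline two!'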