diff options
author | jerous <jerous@gmail.com> | 2008-11-04 21:30:40 +0100 |
---|---|---|
committer | jerous <jerous@gmail.com> | 2008-11-04 21:30:40 +0100 |
commit | 6e61921c5dad2abef348b3874057fbd925bafe61 (patch) | |
tree | aaad9600d94013db9178a62adf20ca4dc7b137ee /misc.py | |
parent | c6d056d2c48b419b8e5e0cb1df17131df481bbaf (diff) |
Lyrics: discard HTML fetched from internet
Diffstat (limited to 'misc.py')
-rw-r--r-- | misc.py | 25 |
1 files changed, 18 insertions, 7 deletions
```diff
@@ -39,12 +39,14 @@ def toAscii(ustr):
         return ustr
     return unicodedata.normalize('NFKD', ustr).encode('ascii', 'ignore')
 
-def fetch(SE, sites, song=None, xtra_tags={}):
+def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
     """Returns None when nothing found, or [site,source-url]."""
+    # compose the search-engine URL
     f=format.compile(SE)
     SE_url=toAscii(f(format.params(song, xtra_tags)))
     SE_url=SE_url.replace(' ', '+')
 
+    # fetch the page from the search-engine with the results
     request=urllib2.Request(SE_url)
     request.add_header('User-Agent', 'montypc')
     opener=urllib2.build_opener()
@@ -54,13 +56,15 @@ def fetch(SE, sites, song=None, xtra_tags={}):
     regex=re.compile('<a href="(.*?)".*?>.*?<\/a>')
     urls=regex.findall(data)
 
-    # look for predefined urls, which are good lyrics-sites
-    # we assume they are in order of importance; the first one matching
-    # is taken
+    # look for urls which are defined in $sites.
+    # The first matching URL is taken
     finalRegex=None
     log.debug("Checking %i URLs on %s"%(len(sites), SE_url))
+    # loop over all sites which may have what we're interested in
     for site in sites:
         finalURL=None
+        finalRegex=None
+        # check if on the results-page there is a link to $site
         for url in urls:
             if url.find(site)>=0:
                 log.debug(" Found site %s in results: %s"%(site, url))
@@ -68,14 +72,17 @@ def fetch(SE, sites, song=None, xtra_tags={}):
                 finalRegex=sites[site]
                 break
 
-        match=None
         if finalURL:
+            match=None
+            # open the url
             cj = cookielib.CookieJar()
             opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
             log.debug(" Reading URL %s"%(finalURL))
             try:
+                # read the page
                 r = opener.open(finalURL)
                 data=r.read()
+                # perform the regular expression
                 regex=re.compile(finalRegex, re.IGNORECASE|re.MULTILINE|re.DOTALL)
                 match=regex.search(data)
             except Exception, e:
@@ -83,10 +90,14 @@ def fetch(SE, sites, song=None, xtra_tags={}):
                 continue
 
             if match:
+                # if the regex matches, then we arrive here
+                # we assume the content we want is in the first group
                 log.debug(" Regex succeeded!")
                 data=match.group(1)
-                data=data.replace('<br>', '<br />')
-                data=data.replace('<br />', '<br />')
+                if stripHTML:
+                    # do we want HTML?
+                    data=re.sub('<br.*?>', '\n', data) # replace <br />'s with newline
+                    data=re.sub('<[^>]*?>', '', data) # strip all other HTML
                 data=data.strip()
                 log.debug("Succeeded fetching.")
                 return [data,finalURL]
```