diff options
author | Anton Khirnov <wyskas@gmail.com> | 2009-08-11 07:27:55 +0200 |
---|---|---|
committer | Anton Khirnov <wyskas@gmail.com> | 2009-08-11 07:27:55 +0200 |
commit | 1b3a7dfad027698ee3218595e7465843add0f3eb (patch) | |
tree | 0e5de0b79686d1923cb042d8c29146218f8e4111 /nephilim/plugins/Lyrics.py | |
parent | 777f5f30bbbb0f79cea29e6705247d3f102fb458 (diff) |
Lyrics: use lxml for parsing animelyrics pages.
Diffstat (limited to 'nephilim/plugins/Lyrics.py')
-rw-r--r-- | nephilim/plugins/Lyrics.py | 20 |
1 file changed, 14 insertions, 6 deletions
```diff
diff --git a/nephilim/plugins/Lyrics.py b/nephilim/plugins/Lyrics.py
index 864b130..8b57f6e 100644
--- a/nephilim/plugins/Lyrics.py
+++ b/nephilim/plugins/Lyrics.py
@@ -175,14 +175,22 @@ class Lyrics(Plugin):
                                      't':'performer'})
         try:
             #get url for lyrics
-            page = urllib.urlopen(url).read()
-            url = re.search('<a href="(.*?)".*?%s'%song.title(), page, re.IGNORECASE).group(1)
+            self.logger.info('Searching Animelyrics: %s.'%url)
+            tree = etree.HTML(urllib.urlopen(url).read())
+            url = None
+            for elem in tree.iterfind('.//a'):
+                if ('href' in elem.attrib) and elem.text and (song.title() in elem.text):
+                    url = 'http://www.animelyrics.com/%s'%elem.get('href')
+                    print url
+            if not url:
+                return None
             #get lyrics
-            url = 'http://www.animelyrics.com%s'%url
-            page = urllib.urlopen(url).read()
+            self.logger.info('Found song URL: %s.'%url)
+            tree = etree.HTML(urllib.urlopen(url).read())
             ret = ''
-            for match in re.finditer('<pre class=lyrics>(.*?)</pre>', page, re.IGNORECASE|re.DOTALL):
-                ret += '%s\n\n'%match.group(1)
+            for elem in tree.iterfind('.//pre'):
+                if elem.get('class') == 'lyrics':
+                    ret += '%s\n\n'%etree.tostring(elem, method = 'text', encoding = 'utf-8')
             return ret
         except socket.error, e:
             self.logger.error('Error downloading lyrics from Animelyrics: %s.'%e)
```