Lyrics: use lxml for parsing animelyrics pages.

author: Anton Khirnov <wyskas@gmail.com> 2009-08-11 07:27:55 +0200
committer: Anton Khirnov <wyskas@gmail.com> 2009-08-11 07:27:55 +0200
commit: 1b3a7dfad027698ee3218595e7465843add0f3eb (patch)
tree: 0e5de0b79686d1923cb042d8c29146218f8e4111 /nephilim/plugins/Lyrics.py
parent: 777f5f30bbbb0f79cea29e6705247d3f102fb458 (diff)
1 file changed, 14 insertions, 6 deletions
diff --git a/nephilim/plugins/Lyrics.py b/nephilim/plugins/Lyrics.py
index 864b130..8b57f6e 100644
--- a/nephilim/plugins/Lyrics.py
+++ b/nephilim/plugins/Lyrics.py
@@ -175,14 +175,22 @@ class Lyrics(Plugin):
                                                                            't':'performer'})
         try:
             #get url for lyrics
-            page   = urllib.urlopen(url).read()
-            url    = re.search('<a href="(.*?)".*?%s'%song.title(), page, re.IGNORECASE).group(1)
+            self.logger.info('Searching Animelyrics: %s.'%url)
+            tree   = etree.HTML(urllib.urlopen(url).read())
+            url    = None
+            for elem in tree.iterfind('.//a'):
+                if ('href' in elem.attrib) and elem.text and (song.title() in elem.text):
+                    url = 'http://www.animelyrics.com/%s'%elem.get('href')
+            print url
+            if not url:
+                return None
             #get lyrics
-            url    = 'http://www.animelyrics.com%s'%url
-            page   = urllib.urlopen(url).read()
+            self.logger.info('Found song URL: %s.'%url)
+            tree = etree.HTML(urllib.urlopen(url).read())
             ret = ''
-            for match in re.finditer('<pre class=lyrics>(.*?)</pre>', page, re.IGNORECASE|re.DOTALL):
-                ret += '%s\n\n'%match.group(1)
+            for elem in tree.iterfind('.//pre'):
+                if elem.get('class') == 'lyrics':
+                    ret += '%s\n\n'%etree.tostring(elem, method = 'text', encoding = 'utf-8')
             return ret
         except socket.error, e:
             self.logger.error('Error downloading lyrics from Animelyrics: %s.'%e)
author	Anton Khirnov <wyskas@gmail.com>	2009-08-11 07:27:55 +0200
committer	Anton Khirnov <wyskas@gmail.com>	2009-08-11 07:27:55 +0200
commit	1b3a7dfad027698ee3218595e7465843add0f3eb (patch)
tree	0e5de0b79686d1923cb042d8c29146218f8e4111 /nephilim/plugins/Lyrics.py
parent	777f5f30bbbb0f79cea29e6705247d3f102fb458 (diff)