summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <wyskas@gmail.com>2009-11-20 07:58:24 +0100
committerAnton Khirnov <wyskas@gmail.com>2009-11-21 07:03:16 +0100
commitd3b9598cb75aee28b10d46f7ed58607d738bc98e (patch)
tree9caeabb60ea6c2123c836a93470254f0dd5d9f7f
parent965494cf20b170b9916c2444b037250c7ae4764b (diff)
Lyrics: fix lyricwiki. yet again.
they really should stop doing this. switched to using regexps, because QXmlStreamReader chokes on their invalid pages and I don't want to introduce more dependencies.
-rw-r--r--nephilim/common.py21
-rw-r--r--nephilim/plugins/Lyrics.py50
2 files changed, 38 insertions, 33 deletions
diff --git a/nephilim/common.py b/nephilim/common.py
index 2136f29..334ced9 100644
--- a/nephilim/common.py
+++ b/nephilim/common.py
@@ -21,6 +21,7 @@ import socket
import logging
import os
import re
+from htmlentitydefs import name2codepoint as n2cp
socket.setdefaulttimeout(8)
@@ -86,6 +87,24 @@ def generate_metadata_path(song, dir_tag, file_tag):
return dirname, filepath
+def substitute_entity(match):
+ ent = match.group(3)
+ if match.group(1) == "#":
+ if match.group(2) == '':
+ return unichr(int(ent))
+ elif match.group(2) == 'x':
+ return unichr(int('0x'+ent, 16))
+ else:
+ cp = n2cp.get(ent)
+ if cp:
+ return unichr(cp)
+ else:
+ return match.group()
+
+def decode_htmlentities(string):
+ entity_re = re.compile(r'&(#?)(x?)(\w+);')
+ return entity_re.subn(substitute_entity, string)[0]
+
class MetadataFetcher(QtCore.QObject):
"""A basic class for metadata fetchers. Provides a fetch(song) function,
emits a finished(song, metadata) signal when done; lyrics is either a Python
@@ -115,7 +134,7 @@ class MetadataFetcher(QtCore.QObject):
self.abort()
self.song = song
- self.logger.info('Searching %s: %s.'%(self. name, url))
+ self.logger.info('Searching %s: %s.'%(self. name, url.toString()))
self.rep = self.nam.get(QtNetwork.QNetworkRequest(url))
self.rep.error.connect(self.handle_error)
diff --git a/nephilim/plugins/Lyrics.py b/nephilim/plugins/Lyrics.py
index af54f0a..d558165 100644
--- a/nephilim/plugins/Lyrics.py
+++ b/nephilim/plugins/Lyrics.py
@@ -19,6 +19,7 @@ from PyQt4 import QtGui, QtCore, QtNetwork
from PyQt4.QtCore import QVariant
import os
+import re
from lxml import etree
from ..plugin import Plugin
@@ -166,6 +167,7 @@ class Lyrics(Plugin):
if not artist:
self.logger.info('Didn\'t find artist in %s artist search results.'%self.name)
return self.finish()
+ self.logger.info('Found artist: %s'%artist)
url = QtCore.QUrl(self.__apiaddress)
url.setQueryItems([('action', 'lyrics'), ('func', 'getSong'), ('artist', artist),
@@ -176,50 +178,34 @@ class Lyrics(Plugin):
def __handle_search_res(self):
url = None
- xml = QtCore.QXmlStreamReader(self.rep)
- while not xml.atEnd():
- token = xml.readNext()
- if token == QtCore.QXmlStreamReader.StartElement:
- if xml.name() == 'url':
- text = xml.readElementText()
- if text and not 'action=edit' in text:
- url = QtCore.QUrl() # the url is already percent-encoded
- url.setEncodedUrl(text)
- if xml.hasError():
- self.logger.error('Error parsing seach results: %s'%xml.errorString())
+
+ # the page is borked utf-8 as of nov 2009, qxmlstreamreader chokes
+ # on it => use regexps
+ match = re.search('<url>(.*)</url>', str(self.rep.readAll()).decode('utf-8', 'replace'),
+ re.DOTALL|re.IGNORECASE)
+ if match and not 'action=edit' in match.group(1):
+ url = QtCore.QUrl() # the url is already percent-encoded
+ url.setEncodedUrl(match.group(1))
if not url:
self.logger.info('Didn\'t find the song on Lyricwiki.')
return self.finish()
- self.logger.info('Found Lyricwiki song URL: %s.'%url)
+ self.logger.info('Found Lyricwiki song URL: %s.'%url.toString())
- # XXX temporary hack to work around lyricwiki.org -> lyrics.wikia.org transition
- if not url.path().startswith('/lyrics'):
- url.setPath('/lyrics%s'%url.path())
req = QtNetwork.QNetworkRequest(url)
self.rep = self.nam.get(req)
self.rep.finished.connect(self.__handle_lyrics)
self.rep.error.connect(self.handle_error)
def __handle_lyrics(self):
+ # the page isn't valid xml, so use regexps
lyrics = ''
- xml = QtCore.QXmlStreamReader(self.rep)
- while not xml.atEnd():
- token = xml.readNext()
- if token == QtCore.QXmlStreamReader.StartElement:
- if xml.name() == 'div' and xml.attributes().value('class') == 'lyricbox':
- while not xml.atEnd():
- token = xml.readNext()
- if token == QtCore.QXmlStreamReader.EndElement and xml.name() == 'div':
- break
- elif token == QtCore.QXmlStreamReader.StartElement and xml.name() == 'br':
- lyrics += '\n'
- elif token == QtCore.QXmlStreamReader.Characters:
- lyrics += xml.text()
- if xml.hasError():
- self.logger.warning('Error parsing lyrics: %s'%xml.errorString())
-
- self.finish(lyrics)
+ for it in re.finditer('<div class=\'lyricbox\'>(?:<div.*?>.*?</div>)?(.*?)(?:<div.*?>.*?</div>)?</div>',
+ str(self.rep.readAll()).decode('utf-8'), re.DOTALL):
+ gr = re.sub('<br />', '\n', it.group(1))
+ gr = re.sub(re.compile('<.*>', re.DOTALL), '', gr)
+ lyrics += gr + '\n'
+ self.finish(common.decode_htmlentities(lyrics))
class FetchAnimelyrics(common.MetadataFetcher):
name = 'Animelyrics'