From d3b9598cb75aee28b10d46f7ed58607d738bc98e Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 20 Nov 2009 07:58:24 +0100 Subject: Lyrics: fix lyricwiki. yet again. they really should stop doing this. switched to using regexps, because QXmlStreamReader chokes on their invalid pages and i don't want to introduce more dependencies. --- nephilim/common.py | 21 ++++++++++++++++++- nephilim/plugins/Lyrics.py | 50 +++++++++++++++++----------------------------- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/nephilim/common.py b/nephilim/common.py index 2136f29..334ced9 100644 --- a/nephilim/common.py +++ b/nephilim/common.py @@ -21,6 +21,7 @@ import socket import logging import os import re +from htmlentitydefs import name2codepoint as n2cp socket.setdefaulttimeout(8) @@ -86,6 +87,24 @@ def generate_metadata_path(song, dir_tag, file_tag): return dirname, filepath +def substitute_entity(match): + ent = match.group(3) + if match.group(1) == "#": + if match.group(2) == '': + return unichr(int(ent)) + elif match.group(2) == 'x': + return unichr(int('0x'+ent, 16)) + else: + cp = n2cp.get(ent) + if cp: + return unichr(cp) + else: + return match.group() + +def decode_htmlentities(string): + entity_re = re.compile(r'&(#?)(x?)(\w+);') + return entity_re.subn(substitute_entity, string)[0] + class MetadataFetcher(QtCore.QObject): """A basic class for metadata fetchers. Provides a fetch(song) function, emits a finished(song, metadata) signal when done; lyrics is either a Python @@ -115,7 +134,7 @@ class MetadataFetcher(QtCore.QObject): self.abort() self.song = song - self.logger.info('Searching %s: %s.'%(self. name, url)) + self.logger.info('Searching %s: %s.'%(self. 
name, url.toString())) self.rep = self.nam.get(QtNetwork.QNetworkRequest(url)) self.rep.error.connect(self.handle_error) diff --git a/nephilim/plugins/Lyrics.py b/nephilim/plugins/Lyrics.py index af54f0a..d558165 100644 --- a/nephilim/plugins/Lyrics.py +++ b/nephilim/plugins/Lyrics.py @@ -19,6 +19,7 @@ from PyQt4 import QtGui, QtCore, QtNetwork from PyQt4.QtCore import QVariant import os +import re from lxml import etree from ..plugin import Plugin @@ -166,6 +167,7 @@ class Lyrics(Plugin): if not artist: self.logger.info('Didn\'t find artist in %s artist search results.'%self.name) return self.finish() + self.logger.info('Found artist: %s'%artist) url = QtCore.QUrl(self.__apiaddress) url.setQueryItems([('action', 'lyrics'), ('func', 'getSong'), ('artist', artist), @@ -176,50 +178,34 @@ class Lyrics(Plugin): def __handle_search_res(self): url = None - xml = QtCore.QXmlStreamReader(self.rep) - while not xml.atEnd(): - token = xml.readNext() - if token == QtCore.QXmlStreamReader.StartElement: - if xml.name() == 'url': - text = xml.readElementText() - if text and not 'action=edit' in text: - url = QtCore.QUrl() # the url is already percent-encoded - url.setEncodedUrl(text) - if xml.hasError(): - self.logger.error('Error parsing seach results: %s'%xml.errorString()) + + # the page is borked utf-8 as of nov 2009, qxmlstreamreader chokes + # on it => use regexps + match = re.search('(.*)', str(self.rep.readAll()).decode('utf-8', 'replace'), + re.DOTALL|re.IGNORECASE) + if match and not 'action=edit' in match.group(1): + url = QtCore.QUrl() # the url is already percent-encoded + url.setEncodedUrl(match.group(1)) if not url: self.logger.info('Didn\'t find the song on Lyricwiki.') return self.finish() - self.logger.info('Found Lyricwiki song URL: %s.'%url) + self.logger.info('Found Lyricwiki song URL: %s.'%url.toString()) - # XXX temporary hack to work around lyricwiki.org -> lyrics.wikia.org transition - if not url.path().startswith('/lyrics'): - 
url.setPath('/lyrics%s'%url.path()) req = QtNetwork.QNetworkRequest(url) self.rep = self.nam.get(req) self.rep.finished.connect(self.__handle_lyrics) self.rep.error.connect(self.handle_error) def __handle_lyrics(self): + # the page isn't valid xml, so use regexps lyrics = '' - xml = QtCore.QXmlStreamReader(self.rep) - while not xml.atEnd(): - token = xml.readNext() - if token == QtCore.QXmlStreamReader.StartElement: - if xml.name() == 'div' and xml.attributes().value('class') == 'lyricbox': - while not xml.atEnd(): - token = xml.readNext() - if token == QtCore.QXmlStreamReader.EndElement and xml.name() == 'div': - break - elif token == QtCore.QXmlStreamReader.StartElement and xml.name() == 'br': - lyrics += '\n' - elif token == QtCore.QXmlStreamReader.Characters: - lyrics += xml.text() - if xml.hasError(): - self.logger.warning('Error parsing lyrics: %s'%xml.errorString()) - - self.finish(lyrics) + for it in re.finditer('
(?:.*?
)?(.*?)(?:.*?)?', + str(self.rep.readAll()).decode('utf-8'), re.DOTALL): + gr = re.sub('
', '\n', it.group(1)) + gr = re.sub(re.compile('<.*>', re.DOTALL), '', gr) + lyrics += gr + '\n' + self.finish(common.decode_htmlentities(lyrics)) class FetchAnimelyrics(common.MetadataFetcher): name = 'Animelyrics' -- cgit v1.2.3