diff options
author | jerous <jerous@gmail.com> | 2008-11-04 22:09:59 +0100 |
---|---|---|
committer | jerous <jerous@gmail.com> | 2008-11-04 22:09:59 +0100 |
commit | a17f20f6399033af174e6c4d7240e4481e822bc8 (patch) | |
tree | d5cfd0d5b2f7edbf2f87d6188d60d94f400427e6 /misc.py | |
parent | 46c7d6ae826d719bd7c921813b870940c368c473 (diff) |
decode HTML entities when fetching from internet
Diffstat (limited to 'misc.py')
-rw-r--r-- | misc.py | 20 |
1 files changed, 20 insertions, 0 deletions
```diff
@@ -1,8 +1,10 @@
 from PyQt4 import QtCore, QtGui
+from htmlentitydefs import name2codepoint as n2cp
 import re
 import urllib2, httplib, cookielib
 import socket
 import unicodedata
+import htmlentitydefs
 
 import format
 import log
@@ -98,6 +100,7 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
         # do we want HTML?
         data=re.sub('<br.*?>', '\n', data) # replace <br />'s with newline
         data=re.sub('<[^>]*?>', '', data) # strip all other HTML
+        data=decodeHTMLEntities(data) # convert HTML entities
         data=data.strip()
         log.debug("Succeeded fetching.")
         return [data,finalURL]
@@ -110,6 +113,23 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
     log.debug("Failed fetching.")
     return None
 
+def substEntity(match):
+    ent = match.group(2)
+    if match.group(1) == "#":
+        return unichr(int(ent))
+    else:
+        cp = n2cp.get(ent)
+
+        if cp:
+            return unichr(cp)
+        else:
+            return match.group()
+
+def decodeHTMLEntities(string):
+    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
+    return entity_re.subn(substEntity, string)[0]
+
+
 class Button(QtGui.QPushButton):
     iconSize=32
     """A simple Button class which calls $onClick when clicked."""
```