summaryrefslogtreecommitdiff
path: root/misc.py
diff options
context:
space:
mode:
authorjerous <jerous@gmail.com>2008-11-04 22:09:59 +0100
committerjerous <jerous@gmail.com>2008-11-04 22:09:59 +0100
commita17f20f6399033af174e6c4d7240e4481e822bc8 (patch)
treed5cfd0d5b2f7edbf2f87d6188d60d94f400427e6 /misc.py
parent46c7d6ae826d719bd7c921813b870940c368c473 (diff)
decode HTML entities when fetching from internet
Diffstat (limited to 'misc.py')
-rw-r--r--misc.py20
1 files changed, 20 insertions, 0 deletions
diff --git a/misc.py b/misc.py
index 168c2ba..a1a48f2 100644
--- a/misc.py
+++ b/misc.py
@@ -1,8 +1,10 @@
from PyQt4 import QtCore, QtGui
+from htmlentitydefs import name2codepoint as n2cp
import re
import urllib2, httplib, cookielib
import socket
import unicodedata
+import htmlentitydefs
import format
import log
@@ -98,6 +100,7 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
# do we want HTML?
data=re.sub('<br.*?>', '\n', data) # replace <br />'s with newline
data=re.sub('<[^>]*?>', '', data) # strip all other HTML
+ data=decodeHTMLEntities(data) # convert HTML entities
data=data.strip()
log.debug("Succeeded fetching.")
return [data,finalURL]
@@ -110,6 +113,23 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
log.debug("Failed fetching.")
return None
def substEntity(match):
    """Return the replacement text for one matched HTML entity.

    Written as a callback for ``re.sub``: ``match.group(1)`` is "#" for a
    numeric character reference (empty otherwise) and ``match.group(2)`` is
    the entity body -- decimal digits, "x"/"X" followed by hex digits, or an
    entity name looked up in ``n2cp``.

    Returns the decoded character, or the matched text unchanged when the
    entity is unknown, malformed, or outside the representable range.  It
    never raises for bad input, so one broken entity cannot abort the
    substitution over the whole document.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() already returns a unicode character

    body = match.group(2)

    if match.group(1) == "#":
        # "\w" in the calling regex admits "_", which int() on newer Pythons
        # would silently accept as a digit separator.
        if "_" in body:
            return match.group()
        try:
            if body[:1] in ("x", "X"):
                codepoint = int(body[1:], 16)  # hex form: &#x41;
            else:
                codepoint = int(body)  # decimal form: &#65;
            return to_char(codepoint)
        except (ValueError, OverflowError):
            # Not a number after all (e.g. "&#abc;") or a code point the
            # interpreter cannot represent: leave the text as it was.
            return match.group()

    codepoint = n2cp.get(body)
    if codepoint is None:
        return match.group()
    return to_char(codepoint)
+
def decodeHTMLEntities(string):
    """Return *string* with its HTML entities replaced.

    Every "&name;" or "&#number;" style reference found in *string* is
    passed to ``substEntity``; whatever that callback returns takes the
    place of the matched text.  Text that does not match the entity pattern
    is copied through untouched.
    """
    # Raw literal: "\d" and "\w" are regex escapes, not string escapes, and
    # a plain literal triggers invalid-escape warnings on newer interpreters.
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")
    return entity_re.sub(substEntity, string)
+
+
class Button(QtGui.QPushButton):
iconSize=32
"""A simple Button class which calls $onClick when clicked."""