summaryrefslogtreecommitdiff
path: root/misc.py
diff options
context:
space:
mode:
author: jerous <jerous@gmail.com> 2008-11-11 23:31:56 +0100
committer: jerous <jerous@gmail.com> 2008-11-11 23:31:56 +0100
commit: b5c11bfd2d33ff37a0fb7ede1196699b68c0e4f1 (patch)
tree: 96cc9b86ab1dea732328b779e0abc1006a7891a5 /misc.py
parent: aaec0f4f9c3b825c3f045377e7bd11eeeda72f64 (diff)
YAUF (yet another Unicode fix): fetch the charset from the data retrieved from the HTML (or assume the default, iso-8859-1) so that the result is always valid Unicode
convert spaces to tabs for two imported methods
Diffstat (limited to 'misc.py')
-rw-r--r--misc.py32
1 files changed, 20 insertions, 12 deletions
diff --git a/misc.py b/misc.py
index dfb907c..6fe176b 100644
--- a/misc.py
+++ b/misc.py
@@ -98,7 +98,14 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
# if the regex matches, then we arrive here
# we assume the content we want is in the first group
log.debug(" Regex succeeded!")
+ try:
+ charset=re.compile('charset=["\']?([\w-]+)').search(data).group(1)
+ log.debug(" charset=%s"%(charset))
+ except:
+ charset='iso-8859-1'
+ log.debug(" charset not found. Assuming %s"%(charset))
data=match.group(1)
+ data=unicode(data, charset)
if stripHTML:
# do we want HTML?
data=re.sub('<br.*?>', '\n', data) # replace <br />'s with newline
@@ -118,20 +125,21 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
return None
def substEntity(match):
- ent = match.group(2)
- if match.group(1) == "#":
- return unichr(int(ent))
- else:
- cp = n2cp.get(ent)
-
- if cp:
- return unichr(cp)
- else:
- return match.group()
+ ent = match.group(2)
+ if match.group(1) == "#":
+ return unichr(int(ent))
+ else:
+ cp = n2cp.get(ent)
+
+ if cp:
+ return unichr(cp)
+ else:
+ return match.group()
def decodeHTMLEntities(string):
- entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
- return entity_re.subn(substEntity, string)[0]
+ # replace entities with their UTF-counterpart
+ entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
+ return entity_re.subn(substEntity, string)[0]
class Button(QtGui.QPushButton):