diff options
author | jerous <jerous@gmail.com> | 2008-11-11 23:31:56 +0100 |
---|---|---|
committer | jerous <jerous@gmail.com> | 2008-11-11 23:31:56 +0100 |
commit | b5c11bfd2d33ff37a0fb7ede1196699b68c0e4f1 (patch) | |
tree | 96cc9b86ab1dea732328b779e0abc1006a7891a5 /misc.py | |
parent | aaec0f4f9c3b825c3f045377e7bd11eeeda72f64 (diff) |
YAUF: yet another unicode fix: fetch charset from data retrieved from HTML (or assume default iso-8859-1) so it is always valid unicode
convert spaces to tabs for two imported methods
Diffstat (limited to 'misc.py')
-rw-r--r-- | misc.py | 32 |
1 files changed, 20 insertions, 12 deletions
```diff
@@ -98,7 +98,14 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
 		# if the regex matches, then we arrive here
 		# we assume the content we want is in the first group
 		log.debug(" Regex succeeded!")
+		try:
+			charset=re.compile('charset=["\']?([\w-]+)').search(data).group(1)
+			log.debug(" charset=%s"%(charset))
+		except:
+			charset='iso-8859-1'
+			log.debug(" charset not found. Assuming %s"%(charset))
 		data=match.group(1)
+		data=unicode(data, charset)
 
 		if stripHTML: # do we want HTML?
 			data=re.sub('<br.*?>', '\n', data) # replace <br />'s with newline
@@ -118,20 +125,21 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
 	return None
 
 def substEntity(match):
-    ent = match.group(2)
-    if match.group(1) == "#":
-        return unichr(int(ent))
-    else:
-        cp = n2cp.get(ent)
-
-        if cp:
-            return unichr(cp)
-        else:
-            return match.group()
+	ent = match.group(2)
+	if match.group(1) == "#":
+		return unichr(int(ent))
+	else:
+		cp = n2cp.get(ent)
+
+		if cp:
+			return unichr(cp)
+		else:
+			return match.group()
 
 def decodeHTMLEntities(string):
-    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
-    return entity_re.subn(substEntity, string)[0]
+	# replace entities with their UTF-counterpart
+	entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
+	return entity_re.subn(substEntity, string)[0]
 
 
 class Button(QtGui.QPushButton):
```