YAUF (yet another Unicode fix): detect the charset declared in the fetched HTML (falling back to iso-8859-1) and decode the extracted data with it, so the result is always valid Unicode

convert spaces to tabs for two imported methods
author: jerous <jerous@gmail.com> 2008-11-11 23:31:56 +0100
committer: jerous <jerous@gmail.com> 2008-11-11 23:31:56 +0100
commit: b5c11bfd2d33ff37a0fb7ede1196699b68c0e4f1 (patch)
tree: 96cc9b86ab1dea732328b779e0abc1006a7891a5 /misc.py
parent: aaec0f4f9c3b825c3f045377e7bd11eeeda72f64 (diff)
1 files changed, 20 insertions, 12 deletions
diff --git a/misc.py b/misc.py
index dfb907c..6fe176b 100644
--- a/misc.py
+++ b/misc.py
@@ -98,7 +98,14 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
 				# if the regex matches, then we arrive here
 				# we assume the content we want is in the first group
 				log.debug("  Regex succeeded!")
+				try:
+					charset=re.compile('charset=["\']?([\w-]+)').search(data).group(1)
+					log.debug("  charset=%s"%(charset))
+				except:
+					charset='iso-8859-1'
+					log.debug("  charset not found. Assuming %s"%(charset))
 				data=match.group(1)
+				data=unicode(data, charset)
 				if stripHTML:
 					# do we want HTML?
 					data=re.sub('<br.*?>', '\n', data)	# replace <br />'s with newline
@@ -118,20 +125,21 @@ def fetch(SE, sites, song=None, xtra_tags={}, stripHTML=True):
 	return None
 
 def substEntity(match):
-    ent = match.group(2)
-    if match.group(1) == "#":
-        return unichr(int(ent))
-    else:
-        cp = n2cp.get(ent)
-
-        if cp:
-            return unichr(cp)
-        else:
-            return match.group()
+	ent = match.group(2)
+	if match.group(1) == "#":
+		return unichr(int(ent))
+	else:
+		cp = n2cp.get(ent)
+
+		if cp:
+			return unichr(cp)
+		else:
+			return match.group()
 
 def decodeHTMLEntities(string):
-    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
-    return entity_re.subn(substEntity, string)[0]
+	# replace entities with their UTF-counterpart
+	entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
+	return entity_re.subn(substEntity, string)[0]
 
 		
 class Button(QtGui.QPushButton):
author	jerous <jerous@gmail.com>	2008-11-11 23:31:56 +0100
committer	jerous <jerous@gmail.com>	2008-11-11 23:31:56 +0100
commit	b5c11bfd2d33ff37a0fb7ede1196699b68c0e4f1 (patch)
tree	96cc9b86ab1dea732328b779e0abc1006a7891a5 /misc.py
parent	aaec0f4f9c3b825c3f045377e7bd11eeeda72f64 (diff)