Better attribute handling. Factored out tag handling into getTags()

turbodog · turbodog · commit a55809611f97 · 2011-06-24T11:07:22.000-07:00
diff --git a/rss2email.py b/rss2email.py
@@ -24,6 +24,7 @@
                      "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
 
 import urllib2
+import BeautifulSoup
 urllib2.install_opener(urllib2.build_opener())
 
 ### Vaguely Customizable Options ###
@@ -382,7 +383,8 @@ def getContent(entry, HTMLOK=0):
 		if not HTMLOK: # Only need to convert to text if HTML isn't OK
 			for c in conts:
 				if contains(c.type, 'html'):
-					return html2text(c.value)
+					cleanerhtml = BeautifulSoup.BeautifulSoup(c.value)
+					return html2text(unicode(cleanerhtml))
 		
 		for c in conts:
 			if c.type == 'text/plain': return c.value
@@ -392,7 +394,8 @@ def getContent(entry, HTMLOK=0):
 	return ""
 
 def getID(entry):
-	"""Get best ID from an entry."""
+	"""Get best ID from an entry.
+	NEEDS UNIT TESTS"""
 	if TRUST_GUID:
 		if 'id' in entry and entry.id: 
 			# Newer versions of feedparser could return a dictionary
@@ -406,17 +409,17 @@ def getID(entry):
 	if 'link' in entry: return entry.link
 	if 'title' in entry: return hash(unu(entry.title)).hexdigest()
 
-def getName(r, entry):
+def getName(fullfeed, entry):
 	"""Get the best name.
 	NEEDS UNIT TESTS"""
 
 	if NO_FRIENDLY_NAME: return ''
 
-	feed = r.feed
-	if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys():
-		return OVERRIDE_FROM[r.url]
+	feedinfo = fullfeed.feed
+	if hasattr(fullfeed, "url") and fullfeed.url in OVERRIDE_FROM.keys():
+		return OVERRIDE_FROM[fullfeed.url]
 	
-	name = feed.get('title', '')
+	name = feedinfo.get('title', '')
 
 	if 'name' in entry.get('author_detail', []): # normally {} but py2.1
 		if entry.author_detail.name:
@@ -427,10 +430,10 @@ def getName(r, entry):
 			except UnicodeDecodeError:
 			    name +=  unicode(entry.author_detail.name, 'utf-8')
 
-	elif 'name' in feed.get('author_detail', []):
-		if feed.author_detail.name:
+	elif 'name' in feedinfo.get('author_detail', []):
+		if feedinfo.author_detail.name:
 			if name: name += ", "
-			name += feed.author_detail.name
+			name += feedinfo.author_detail.name
 	
 	return name
 
@@ -469,6 +472,21 @@ def getEmail(r, entry):
 		return DEFAULT_EMAIL[r.url]
 	return DEFAULT_FROM
 
+def getTags(entry):
+	"""Build the comma-separated tag line for an entry.
+
+	Collects the 'term' of every item in entry['tags'] and joins them with
+	",". Items without a 'term' key, or whose term is empty/None, are
+	skipped so the join never sees a non-string. Returns "" when the entry
+	has no 'tags' key, the tag list is empty, or no tag has a usable term.
+	"""
+	# `or []` covers an entry whose 'tags' value is None or missing.
+	tags = entry.get('tags') or []
+	# Skip tags whose term is missing, None or empty: join() needs strings.
+	terms = [tag['term'] for tag in tags if tag.get('term')]
+	return ",".join(terms)
+	
+
 ### Simple Database of Feeds ###
 
 class Feed:
@@ -689,16 +707,8 @@ def run(num=None):
 					useragenthdr = "rss2email"
 					
 					# Add post tags, if available
-					tagline = ""
-					if 'tags' in entry:
-						tags = entry.get('tags')
-						taglist = []
-						if tags:
-							for tag in tags:
-								taglist.append(tag['term'])
-						if taglist:
-							tagline = ",".join(taglist)
-					
+					tagline = getTags(entry)
+
 					extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
 					if BONUS_HEADER != '':
 						for hdr in BONUS_HEADER.strip().splitlines():