Miscellaneous old utils.

2025-09-18 21:43:31 +00:00 · 2016-08-04 11:00:33 -07:00 · 2016-08-04 11:00:33 -07:00 · 840bb43fc2
commit 840bb43fc2
parent 7e7aae0967
2 changed files with 187 additions and 0 deletions
--- a/utils/green.py
+++ b/utils/green.py
@ -0,0 +1,36 @@
+from gevent import monkey
+monkey.patch_socket()
+
+from newsblur.utils import feedparser
+import gevent
+from gevent import queue
+import urllib2
+
+def fetch_title(url):
+    print "Running %s" % url
+    data = urllib2.urlopen(url).read()
+    print "Parsing %s" % url
+    d = feedparser.parse(data)
+    print "Parsed %s" % d.feed.get('title', '')
+    return d.feed.get('title', '')
+
+def worker():
+    while True:
+        url = q.get()
+        try:
+            fetch_title(url)
+        finally:
+            q.task_done()
+
+if __name__ == '__main__':
+    q = queue.JoinableQueue()
+    for i in range(5):
+         gevent.spawn(worker)
+
+    for url in "http://www.43folders.com/rss.xml/nhttp://feeds.feedburner.com/43folders/nhttp://www.43folders.com/rss.xml/nhttp://feeds.feedburner.com/43folders/nhttp://feeds.feedburner.com/AMinuteWithBrendan/nhttp://feeds.feedburner.com/AMinuteWithBrendan/nhttp://www.asianart.org/feeds/Lectures,Classes,Symposia.xml/nhttp://www.asianart.org/feeds/Performances.xml/nhttp://feeds.feedburner.com/ajaxian/nhttp://ajaxian.com/index.xml/nhttp://al3x.net/atom.xml/nhttp://feeds.feedburner.com/AmericanDrink/nhttp://feeds.feedburner.com/eod_full/nhttp://feeds.feedburner.com/typepad/notes/nhttp://feeds.dashes.com/AnilDash/nhttp://rss.sciam.com/assignment-impossible/feed/nhttp://blogs.scientificamerican.com/assignment-impossible//nhttp://feeds.feedburner.com/Beautiful-Pixels/nhttp://feeds.feedburner.com/Beautiful-Pixels/nhttp://www.betabeat.com/feed/".split('/n'):
+            print "Spawning: %s" % url
+            q.put(url)
+
+    q.join()  # block until all tasks are done
+
+
--- a/utils/knight.py
+++ b/utils/knight.py
@ -0,0 +1,151 @@
+# Screen scrapes the Knight News Challenge entries (all 64 pages of them)
+# and counts the number of votes/hearts for each entry. Then displays them
+# in rank order.
+# 
+# This script runs in about 20 seconds.
+
+import requests
+from BeautifulSoup import BeautifulSoup
+
+# Winners found on http://newschallenge.tumblr.com/post/20962258701/knight-news-challenge-on-networks-moving-to-the-next:
+# 
+#     $('.posts .MsoNormal > span').find('a[href^="http://newschallenge.tumblr.com/post"]').map(function() { 
+#         return $(this).attr('href');
+#     });
+
+winners = [  # URLs that find_entries() matches scraped entry URLs against
+    "http://newschallenge.tumblr.com/post/20962258701/knight-news-challenge-on-networks-moving-to-the-next#disqus_thread",  # same post as the source page cited above, with a #disqus_thread fragment
+    "http://newschallenge.tumblr.com/post/19436493313/amauta-a-collaborative-media-network",
+    "http://newschallenge.tumblr.com/post/19493987224/cont3nt-com-lets-you-sell-media-in-real-time-via",
+    "http://newschallenge.tumblr.com/post/19494011127/expand-the-unconsumption-project",
+    "http://newschallenge.tumblr.com/post/19493557384/mediareputations-com-verifies-your-credentials-and",
+    "http://newschallenge.tumblr.com/post/19438230966/prescouter-storify-meets-wikipedia",
+    "http://newschallenge.tumblr.com/post/20968613548/themes-surprises-and-outliers-from-1000",
+    "http://newschallenge.tumblr.com/post/19436493313/amauta-a-collaborative-media-network",  # repeated: already listed above
+    "http://newschallenge.tumblr.com/post/19436676620/filling-foreign-news-gaps-with-scholars-asia-beat",
+    "http://newschallenge.tumblr.com/post/19478851354/bridging-the-big-data-digital-divide-information",
+    "http://newschallenge.tumblr.com/post/19492881188/1-what-do-you-propose-to-do-20-words-scale",
+    "http://newschallenge.tumblr.com/post/19121005017/citjo-connecting-twitter-users-with-media-buyers",
+    "http://newschallenge.tumblr.com/post/19479493999/connecting-the-global-hacks-hackers-network",
+    "http://newschallenge.tumblr.com/post/19436607188/connecting-the-world-with-rural-india-via-facebook-and",
+    "http://newschallenge.tumblr.com/post/19493987224/cont3nt-com-lets-you-sell-media-in-real-time-via",  # repeated: already listed above
+    "http://newschallenge.tumblr.com/post/19438970667/the-cowbird-community-reporting-project",
+    "http://newschallenge.tumblr.com/post/19450699629/new-contribution-tools-for-openstreetmap",
+    "http://newschallenge.tumblr.com/post/19479653130/differentfeather",
+    "http://newschallenge.tumblr.com/post/19478834324/diy-drone-fleets-for-airborne-web-journalism",
+    "http://newschallenge.tumblr.com/post/19483270689/docs-to-wordpress-to-indesign",
+    "http://newschallenge.tumblr.com/post/19477903682/electoral-college-of-me",
+    "http://newschallenge.tumblr.com/post/19404846313/envirofact",
+    "http://newschallenge.tumblr.com/post/19490695157/funf-org-open-mobile-sensing",
+    "http://newschallenge.tumblr.com/post/19490695157/funf-org-open-mobile-sensing",  # repeated: same as the previous line
+    "http://newschallenge.tumblr.com/post/19419901491/global-censorship-monitoring-system",
+    "http://newschallenge.tumblr.com/post/19065611908/a-google-news-for-the-social-web",
+    "http://newschallenge.tumblr.com/post/19438785842/hawaii-eco-net",
+    "http://newschallenge.tumblr.com/post/19180046026/hypothes-is-an-annotation-layer-for-the-web",
+    "http://newschallenge.tumblr.com/post/19479029243/iava-new-gi-bill-veterans-alumni-network-vets",
+    "http://newschallenge.tumblr.com/post/19436574450/m-health-news-network",
+    "http://newschallenge.tumblr.com/post/19493557384/mediareputations-com-verifies-your-credentials-and",  # repeated: already listed above
+    "http://newschallenge.tumblr.com/post/19480304461/mesh-potato-2-0",
+    "http://newschallenge.tumblr.com/post/19479664924/mobile-publishing-for-everyone",
+    "http://newschallenge.tumblr.com/post/19494194541/noula-crowdsourcing-needs-mapping-and-developping",
+    "http://newschallenge.tumblr.com/post/19484970513/peepol-tv-live-tv-powered-by-and-for-the-people",
+    "http://newschallenge.tumblr.com/post/19438230966/prescouter-storify-meets-wikipedia",  # repeated: already listed above
+    "http://newschallenge.tumblr.com/post/19479504346/prozr-twitter-stories-in-a-snap",
+    "http://newschallenge.tumblr.com/post/19345456890/rbutr-follow-online-discourse-between-websites",
+    "http://newschallenge.tumblr.com/post/18794349346/recovers-org-community-powered-disaster-recovery",
+    "http://newschallenge.tumblr.com/post/19436424823/secure-anonymous-journalism-toolkit",
+    "http://newschallenge.tumblr.com/post/19021661497/sensor-networks-for-news",
+    "http://newschallenge.tumblr.com/post/19450685278/tethr-evolving-networks",
+    "http://newschallenge.tumblr.com/post/19293910540/the-pressforward-dashboard",
+    "http://newschallenge.tumblr.com/post/18576274733/thinkup",
+    "http://newschallenge.tumblr.com/post/19345435254/1-what-do-you-propose-to-do-20-words-build-a",
+    "http://newschallenge.tumblr.com/post/19490689958/truth-goggles",
+    "http://newschallenge.tumblr.com/post/19403515934/truth-teller",
+    "http://newschallenge.tumblr.com/post/19494011127/expand-the-unconsumption-project",  # repeated: already listed above
+    "http://newschallenge.tumblr.com/post/19290074949/unicef-gis-youth-led-digital-mapping",
+    "http://newschallenge.tumblr.com/post/19397319461/watchup-the-first-news-watcher",
+    "http://newschallenge.tumblr.com/post/19493588407/water-canary",
+    "http://newschallenge.tumblr.com/post/19480319147/a-bridge-between-wordpress-and-git",
+    "http://newschallenge.tumblr.com/post/19414762330/in-the-life-media-transforming-lgbt-journalism",
+    "http://newschallenge.tumblr.com/post/19493920734/get-to-the-source",
+    "http://newschallenge.tumblr.com/post/19480128205/farm-to-table-school-lunch",
+    "http://newschallenge.tumblr.com/post/19477700441/partisans-org",
+    "http://newschallenge.tumblr.com/post/19345505702/protecting-journalists-and-engaging-communities"]
+
+def find_entries():
+    page = 1
+    total_entry_count = 0
+    entries = []
+
+    while True:
+        print " ---> Found %s entries so far. Now on page: %s" % (len(entries), page)
+    
+        knight_url = "http://newschallenge.tumblr.com/page/%s" % (page)
+        html = requests.get(knight_url).content
+        soup = BeautifulSoup(html)
+        postboxes = soup.findAll("div", "postbox")
+    
+        # Done if only sticky entry is left.
+        if len(postboxes) <= 1:
+            break
+
+        page += 1
+        
+        # 15 entries per page, plus a sticky throwaway entry
+        for entry in postboxes:
+            if 'stickyPost' in entry.get('class'): continue
+        
+            total_entry_count += 1
+            likes = entry.find("", "home-likes")
+            if likes and likes.text:
+                likes = int(likes.text)
+            else:
+                likes = 0
+            
+            comments = entry.find("", "home-comments")
+            if comments and comments.text:
+                comments = int(comments.text)
+            else:
+                comments = 0
+        
+            title = entry.find("h2")
+            if title:
+                title = title.text
+            
+            url = entry.find('a', "home-view")
+            if url:
+                url = url.get('href')
+            
+            # Only record active entries
+            if comments or likes:
+                entries.append({
+                    'likes': likes,
+                    'comments': comments,
+                    'title': title,
+                    'url': url,
+                })
+        # time.sleep(random.randint(0, 2))
+    
+    entries.sort(key=lambda e: e['comments'] + e['likes'])
+    entries.reverse()
+    active_entry_count = len(entries)
+    
+    found_entries = []
+    winner_count = 0
+    for i, entry in enumerate(entries):
+        is_winner = entry['url'] in winners
+        if is_winner: winner_count += 1
+        print " * %s#%s: %s likes - [%s](%s)%s" % (
+            "**" if is_winner else "",
+            i + 1,
+            entry['likes'], entry['title'], 
+            entry['url'],
+            "**" if is_winner else "")
+        found_entries.append(entry)
+        
+    print " ***> Found %s active entries among %s total applications with %s/%s winners." % (
+        active_entry_count, total_entry_count, winner_count, len(winners))
+    return found_entries
+
+if __name__ == '__main__':  # scrape and print the ranking only when run directly as a script
+    find_entries()