#####################################
#digg network scraper, by bernie.
#Bernie Hogan
#NetLab, University of Toronto
#
#1. Get the friends page for any given person.
#2. Scrape the friends on this page and repeat for every page of links.
#3. Check for duplicates.
#
#http://www.digg.com/users/[name]/friends/list/page[d]
#You can tell the last page because the one after it will have no links.
#Each friend will have a handle, and some also have a name.
#If they have a name, the handle will be in parentheses.
#It's actually a joke.

import urllib, re

def scrapeDiggPage(pagetext, scrapeType, user=""):
    #Pull user handles out of a page of Digg HTML. Handles appear in links
    #of the form href="/users/[handle]"; the slice strips the 13-character
    #prefix href="/users/. Duplicates and the page owner are dropped.
    if scrapeType != 'users':
        return None
    tmplist = re.findall(r'href="/users/\w*', pagetext)
    tmplist = [x[13:] for x in tmplist]
    finaldict = {}
    for j in tmplist:
        if j != user and j not in finaldict and len(j) > 0:
            finaldict[j] = j
    if len(finaldict) > 0:
        return finaldict.keys()
    else:
        return None

def getAllFriends(username, listType):
    #Walk a user's friend listing ("list" = people they befriended,
    #"befriended" = people who befriended them) one page at a time,
    #stopping at the first page with no user links on it.
    names = []
    i = 1
    while True:
        print "page " + str(i)
        url = ("http://www.digg.com/users/" + username +
               "/friends/" + listType + "/page" + str(i))
        page = open(urllib.urlretrieve(url)[0]).read()
        x = scrapeDiggPage(page, 'users', username)
        if not x:
            break
        names += x
        i += 1
    return names

digguser = "GrilledOnion"

print "Friends of " + digguser + ":\n"
userfriends = getAllFriends(digguser, "list")

print "Those who befriended " + digguser + ":\n"
userbf = getAllFriends(digguser, "befriended")

frset = set(userfriends)
bfset = set(userbf)

total = set.union(frset, bfset)
print "Total Nodes: " + str(len(total))
print total

#A mutual tie means each user appears on the other's list.
mutual = set.intersection(frset, bfset)
print "Mutual Nodes: " + str(len(mutual))
print mutual

#Write the network out in GDF format (nodes first, then edges).
fileout = open("/Users/bernie/Documents/Projects/diggnetnation/outnet_" + digguser + ".gdf", 'w')
fileout.write("nodedef> name\n" + digguser + "\n")
nodelist = list(mutual)
fileout.write("\n".join(nodelist) + "\nedgedef> node1,node2\n")
fileout.flush()

#Now that we know who the user's friends are, we can look to see whether any
#of these people are friends of each other. Basically we go through the same
#process, except in a FAR more computationally expensive pass that (in all
#reality) could be optimized. But I couldn't be bothered.
for j in mutual:
    print "Friends of " + j + ":\n"
    userfriends = getAllFriends(j, "list")

    print "Those who befriended " + j + ":\n"
    userbf = getAllFriends(j, "befriended")

    frset = set(userfriends)
    bfset = set(userbf)
    mutuala = set.intersection(frset, bfset)
    print "Mutual Nodes: " + str(len(mutuala))
    print mutuala

    #Only keep edges between people who are themselves mutual friends
    #of the ego; these are the edges of the ego network.
    edgeset = mutual.intersection(mutuala)
    print edgeset
    outstr = ''
    for k in edgeset:
        outstr += j + "," + k + "\n"
    fileout.write(outstr)
    fileout.flush()

fileout.close()

#Notes:
#Ways of filtering: people who haven't dugg anything in the last N days or
#within a certain time frame.
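
#Below is a minimal sketch of that filtering idea. It assumes a hypothetical
#helper, lastDiggDate(user), that returns the epoch time of a user's most
#recent digg; actually filling it in would mean scraping a date off the
#user's history page, which this script does not attempt.
import time

def lastDiggDate(user):
    #Hypothetical: return seconds-since-epoch of the user's newest digg,
    #or None if it can't be determined.
    return None

def activeWithin(user, ndays):
    #True if the user dugg something in the last ndays.
    cutoff = time.time() - ndays * 24 * 60 * 60
    last = lastDiggDate(user)
    return last is not None and last >= cutoff

#Usage: keep only mutual friends active in the last 30 days, e.g.
#mutual = set(u for u in mutual if activeWithin(u, 30))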