#####################################
#digg network scraper, by bernie.
#Bernie Hogan
#NetLab, University of Toronto
#
#1. Get the friends page for any given person.
#2. Scrape the friends on this page and repeat for every page of links.
#3. Check for duplicates.
#
#http://www.digg.com/users/[name]/friends/list/page[d]
#You can tell the last page because the one after it will have no links.
#Each friend will have a handle, and some also have a name.
#If they have a name, the handle will be in parentheses.
#It's actually a joke.

import urllib, re

def scrapeDiggPage(pagetext, scrapeType, user=""):
    #Pull user handles out of a page of Digg HTML. Handles appear in links
    #of the form href="/users/[handle]"; the slice strips the 13-character
    #prefix href="/users/. Duplicates and the page owner are dropped.
    if scrapeType != 'users':
        return None
    tmplist = re.findall(r'href="/users/\w*', pagetext)
    tmplist = [x[13:] for x in tmplist]
    finaldict = {}
    for j in tmplist:
        if j != user and j not in finaldict and len(j) > 0:
            finaldict[j] = j
    if len(finaldict) > 0:
        return finaldict.keys()
    else:
        return None

def getAllFriends(username, listType):
    #Walk a user's friend listing ("list" = people they befriended,
    #"befriended" = people who befriended them) one page at a time,
    #stopping at the first page with no user links on it.
    names = []
    i = 1
    while True:
        print "page " + str(i)
        url = ("http://www.digg.com/users/" + username +
               "/friends/" + listType + "/page" + str(i))
        page = open(urllib.urlretrieve(url)[0]).read()
        x = scrapeDiggPage(page, 'users', username)
        if not x:
            break
        names += x
        i += 1
    return names

digguser = "GrilledOnion"

print "Friends of " + digguser + ":\n"
userfriends = getAllFriends(digguser, "list")

print "Those who befriended " + digguser + ":\n"
userbf = getAllFriends(digguser, "befriended")

frset = set(userfriends)
bfset = set(userbf)

total = set.union(frset, bfset)
print "Total Nodes: " + str(len(total))
print total

#A mutual tie means each user appears on the other's list.
mutual = set.intersection(frset, bfset)
print "Mutual Nodes: " + str(len(mutual))
print mutual

#Write the network out in GDF format (nodes first, then edges).
fileout = open("/Users/bernie/Documents/Projects/diggnetnation/outnet_" + digguser + ".gdf", 'w')
fileout.write("nodedef> name\n" + digguser + "\n")
nodelist = list(mutual)
fileout.write("\n".join(nodelist) + "\nedgedef> node1,node2\n")
fileout.flush()

#Now that we know who the user's friends are, we can look to see whether any
#of these people are friends of each other. Basically we go through the same
#process, except in a FAR more computationally expensive pass that (in all
#reality) could be optimized. But I couldn't be bothered.
for j in mutual:
    print "Friends of " + j + ":\n"
    userfriends = getAllFriends(j, "list")

    print "Those who befriended " + j + ":\n"
    userbf = getAllFriends(j, "befriended")

    frset = set(userfriends)
    bfset = set(userbf)
    mutuala = set.intersection(frset, bfset)
    print "Mutual Nodes: " + str(len(mutuala))
    print mutuala

    #Only keep edges between people who are themselves mutual friends
    #of the ego; these are the edges of the ego network.
    edgeset = mutual.intersection(mutuala)
    print edgeset
    outstr = ''
    for k in edgeset:
        outstr += j + "," + k + "\n"
    fileout.write(outstr)
    fileout.flush()

fileout.close()

#Notes:
#Ways of filtering: people who haven't dugg anything in the last N days or
#within a certain time frame.
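
#Below is a minimal sketch of that filtering idea. It assumes a hypothetical
#helper, lastDiggDate(user), that returns the epoch time of a user's most
#recent digg; actually filling it in would mean scraping a date off the
#user's history page, which this script does not attempt.
import time

def lastDiggDate(user):
    #Hypothetical: return seconds-since-epoch of the user's newest digg,
    #or None if it can't be determined.
    return None

def activeWithin(user, ndays):
    #True if the user dugg something in the last ndays.
    cutoff = time.time() - ndays * 24 * 60 * 60
    last = lastDiggDate(user)
    return last is not None and last >= cutoff

#Usage: keep only mutual friends active in the last 30 days, e.g.
#mutual = set(u for u in mutual if activeWithin(u, 30))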