# fg-booky

#!/usr/bin/env python
#
# by Erik Osheim
#
# the way to use this is:
#
# 1. open terminal
#
# 2. go to whatever directory (folder) contains this script
#
# 3. copy the HTML export that you want to play with somewhere
#
# 4. run "python booky INPUT.html > OUTPUT.csv"
#
#    INPUT.html should be the path to the input HTML, e.g.
#      ~/documents/summer-2010.html
#      ./my-data.html
#      etc...
#
#    OUTPUT.csv should be wherever you want the output to go, e.g.
#      ./my-output.csv
#      ~/documents/exports/blahblah.csv
#
#    the output CSV uses tab characters to separate fields, and newlines
#    to separate records.

import sys
from BeautifulSoup import BeautifulSoup

# p tags with these classes hold the school name (Story.name)
name_tags = set(["x2-name"])

# p tags with these classes hold address lines; each matching p tag
# becomes its own entry in Story.addresses
address_tags = set(["x3-address"])

# p tags with these classes hold the website (Story.website)
website_tags = set(["x3-web"])

# p tags with these classes hold the general-info body content
# (Story.generalinfo)
generalinfo_tags = set(["x4-body-style-7-small-tight"])

# p tags with these classes hold the racial info (Story.racialinfo)
racialinfo_tags = set(["x5-racial-demo"])

# p tags with these classes are ignored: they are skipped while
# scanning a div and never contribute content to a story.
ignore_tags = set(
        ["group", "x1-section-hed"])

# replacement table used by cleanup(). the keys are the UTF-8 byte
# sequences of typographic characters; curly quotes map to plain ASCII
# quotes, everything else maps to a named HTML entity.
specialchars = {
    "\xe2\x80\x99": "'",
    "\xe2\x80\x9d": '"',
    "\xe2\x80\x9c": '"',
    "\xe2\x80\x98": "'",
    "\xe2\x80\x93": "&ndash;",
    "\xe2\x80\x94": "&mdash;",
    "\xe2\x80\xa2": "&bull;",
    "\xc3\xad": "&iacute;",
    "\xc3\xa9": "&eacute;",
    "\xc3\xa1": "&aacute;",
    "\xc3\xb1": "&ntilde;",
    "\xc3\xb3": "&oacute;",
    "\xc3\xba": "&uacute;",
    "\xe2\x80\xa6": "&hellip;",
}

# cleanup(s) returns a copy of s in which every key of "specialchars"
# has been swapped for its replacement. tabs are the field separator of
# the output, so a string that contains one is rejected with an
# Exception instead of being cleaned.
def cleanup(s):
    if s.find('\t') != -1:
        raise Exception("illegal tab character found!!")
    cleaned = s
    for raw, substitute in specialchars.items():
        cleaned = cleaned.replace(raw, substitute)
    return cleaned

# this class represents a story (one school entry). each of the 5 fields
# defaults to "empty": the four text fields to an empty string and
# addresses to a new, empty list. it's mostly for organization purposes
# right now.
class Story(object):
    def __init__(self, name="", addresses=None, website="", generalinfo="", racialinfo=""):
        self.name = name
        # a literal [] default would be created once and shared by every
        # Story built without an explicit addresses argument, so appending
        # to one story's list would change all of them. build a fresh list
        # per instance instead.
        if addresses is None:
            addresses = []
        self.addresses = addresses
        self.website = website
        self.generalinfo = generalinfo
        self.racialinfo = racialinfo

# set this to True when you want to see debugging output.
debugging = True

# use debug() when you want to print out test strings.
#
# the message goes to stderr, not stdout: stdout is what gets redirected
# into the output file ("> OUTPUT.csv"), so debugging text written there
# would end up mixed into the exported data.
def debug(s):
    if debugging:
        sys.stderr.write("%s\n" % (s,))

# This is the "main" section that runs when you run the script.
#
# It parses the HTML export named on the command line, builds one Story
# for every <div class="story"> that has general-info body content, and
# writes the result to stdout as tab-separated rows: one row per field,
# one column per story.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # give a usage hint instead of a bare IndexError
        sys.exit('usage: python booky INPUT.html > OUTPUT.csv')
    path = sys.argv[1]

    # read the whole export and make sure the file handle gets closed
    source = open(path, 'r')
    try:
        html = source.read()
    finally:
        source.close()

    soup = BeautifulSoup(html)
    divs = soup.findAll('div', {'class': 'story'})

    debug('found %d divs' % len(divs))

    # spans with these classes are rewritten as <em>...</em> in the body
    em_classes = ('trade-obl', 'cover-body-italic', 'bio-unital')

    # keep track of the stories we've seen so far
    stories = []

    # loop over each div tag we found in the HTML
    for div in divs:
        # classes of the (non-ignored) p tags inside this div; used to
        # decide whether the div is a story or not
        classes = set()

        # the p tags that carry content we care about, in document order
        important = []

        for p in div.findAll('p'):
            # we only care about p tags that have a class which is not
            # on the ignore list
            c = p.get('class')
            if not c or c in ignore_tags:
                continue
            classes.add(c)
            important.append(p)

        # a div is only a story if it has at least one general-info
        # paragraph; anything else is skipped.
        if not classes.intersection(generalinfo_tags):
            debug("skipping... %r" % classes)
            continue

        # NAME , ADDRESSES , WEBSITE , GENERAL-INFO , RACIAL-INFO
        name = []
        addresses = []
        website = []
        prebody = []
        generalinfo = []
        racialinfo = []

        # loop over our "important" tags, adding content to whichever
        # field this p applies to.
        for p in important:
            c = p.get('class')

            if c == 'subhead-3':
                # sub-headlines go straight into the body as <h3> tags.
                # NOTE(review): the body paragraphs are only appended
                # after this loop, so every <h3> ends up ahead of all
                # body paragraphs -- confirm that ordering is intended.
                generalinfo.append('<h3>' + p.renderContents() + '</h3>')
                continue

            # drop the class attribute so it is not part of the output
            del p['class']
            data = str(p)

            if c in name_tags:
                name.append(data)
            elif c in address_tags:
                addresses.append(data)
            elif c in website_tags:
                website.append(data)
            elif c in generalinfo_tags:
                # body paragraphs still need their spans rewritten, so
                # keep the tag itself and render it later
                prebody.append(p)
            elif c in racialinfo_tags:
                racialinfo.append(data)
            else:
                raise Exception("arggg! can't handle p class=%r!!" % c)

        # turn the italic-style spans of each body paragraph into <em>
        # tags, then render the paragraph into the general-info field
        for p in prebody:
            for span in p.findAll("span"):
                if span.get('class') in em_classes:
                    span.replaceWith("<em>%s</em>" % span.text)
            generalinfo.append(str(p))

        # join all the separate p tag content into the strings we care
        # about; addresses stay a list, one entry per address line
        story = Story(name=cleanup("".join(name)),
                      addresses=[cleanup(s) for s in addresses],
                      website=cleanup("".join(website)),
                      generalinfo=cleanup("".join(generalinfo)),
                      racialinfo=cleanup("".join(racialinfo)))

        stories.append(story)

    # build the output rows; each row has one cell per story.

    # first all the names
    rows = [[st.name for st in stories]]

    # next the address lines: row i holds the i-th address line of every
    # story. stories can have different numbers of address lines (and
    # there may be no stories at all), so use the longest list to decide
    # how many rows to write and pad the shorter ones with empty cells
    # to keep the columns lined up.
    address_rows = 0
    for st in stories:
        address_rows = max(address_rows, len(st.addresses))
    for i in range(address_rows):
        row = []
        for st in stories:
            if i < len(st.addresses):
                row.append(st.addresses[i])
            else:
                row.append("")
        rows.append(row)

    # next the websites
    rows.append([st.website for st in stories])

    # next the general info
    rows.append([st.generalinfo for st in stories])

    # finally the racial info
    rows.append([st.racialinfo for st in stories])

    # fields are separated by tabs, records by newlines
    for row in rows:
        sys.stdout.write("\t".join(row) + "\n")

    # NOTE(review): nothing in the visible code uses "re"; the import is
    # kept only so the module-level name stays defined.
    import re