Fundraising 2012/FundStatScraper.py

Python 2.7 script for extracting tabular data from the Fundraising Statistics page.

Note: rows are written in the order they appear on the page, so the saved data may be out of order.

#!/usr/bin/python
"""Wikimedia Foundation Fundraiser Statistics Scraper

This script will load and scrape data from http://wikimediafoundation.org/wiki/Special:FundraiserStatistics
and place it into a CSV file. It takes one argument: the CSV file name.
"""

import sys
import urllib2
import xml.dom.minidom

# Usage check: exactly one argument (the output CSV path) is required.
# sys.exit() with a string writes the message to stderr and exits with
# status 1, so a usage error is reported to the caller as a failure.
if len(sys.argv) != 2:
    sys.exit("This script needs 1 argument: A path to a writable CSV file. If the file does not exist it will be created.")

def floatC(s):
    """Convert a value whose text may contain commas into a float.

    The argument is turned into a string, every comma is dropped (so
    "1,234.56" becomes "1234.56"), and the remainder is parsed by float().
    Raises ValueError if what is left is not a valid float literal.
    """
    text = str(s)
    without_commas = "".join(text.split(","))
    return float(without_commas)

def _cellNumber(row, index):
    """Return the number held in the child of `row` at position `index`.

    Reads the node value of that child's first child and converts it with
    floatC, which drops commas before parsing the float.
    """
    return floatC(row.childNodes[index].firstChild.nodeValue)


# Download the statistics page with a custom User-Agent header.
print("Obtaining page.")
response = urllib2.urlopen(urllib2.Request(
    'http://wikimediafoundation.org/wiki/Special:FundraiserStatistics',
    headers={'User-Agent':'FundStatScrapeBot'}
))
try:
    page = response.read()
finally:
    # Release the HTTP connection even if the read fails.
    response.close()

# The page must be well-formed XML for minidom to accept it.
print("Parsing DOM.")
dom = xml.dom.minidom.parseString(page)

print("Opening output file.")
# The with-block flushes and closes the file even if a row fails to parse.
with open(sys.argv[1], 'w') as out:
    out.write("date, dayTotal, contributions, avg, max, cumTotal\n")

    print("Iterating DOM.")
    for div in dom.getElementsByTagName('div'):
        # Only the per-day statistics boxes carry data; skip every other div.
        if div.getAttribute('class') != 'fundraiserstats-view-box':
            continue

        # The first child of the box holds the rows: row 0 carries the date,
        # rows 1 and 2 carry the figures.
        rows = div.firstChild.childNodes
        date = rows[0].firstChild.firstChild.firstChild.nodeValue
        r2 = rows[1]
        r3 = rows[2]

        # Values sit at the odd child positions of each row.
        dayTotal = _cellNumber(r2, 1)
        contributions = _cellNumber(r2, 3)
        avg = _cellNumber(r2, 5)

        # Named maxValue so the max() builtin is not shadowed.
        maxValue = _cellNumber(r3, 1)
        cumTotal = _cellNumber(r3, 3)

        out.write("%s, %0.2f, %d, %0.2f, %0.2f, %0.2f\n" % (date, dayTotal, contributions, avg, maxValue, cumTotal))

print("Done.")