# coding: utf-8
"""debianforum.de donation plot.

Written by TRex, modified by Meillo.

Downloads the donation and expense tables from the debianforum.de wiki,
sums them up per month and writes a plot of the monthly donations, the
monthly expenses and the resulting running account balance to an image file.

Usage: SCRIPT OUTFILE.png

The code avoids version-specific syntax so that it runs on Python 2.7 (as
originally written) as well as on Python 3.  Third-party modules (lxml,
matplotlib, numpy) are imported inside the functions that need them, so the
pure aggregation helpers can be imported without them being installed.
"""
from contextlib import closing
from datetime import date, datetime, timedelta  # timedelta kept from the original imports
from io import BytesIO, StringIO  # BytesIO kept from the original imports
from itertools import groupby
from sys import argv

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

donations_url = "https://wiki.debianforum.de/Debianforum.de/Spenden"
expenses_url = "https://wiki.debianforum.de/Debianforum.de/Ausgaben"


def get_donation_urls(current_year, first_year=2003):
    """Return a dict mapping each year to the URL of its donation table.

    Years before *current_year* use ``donations_url/<year>``; the table of
    *current_year* itself is served from ``donations_url`` directly.
    """
    return {
        year: (donations_url + '/' + str(year) if year < current_year else donations_url)
        for year in range(first_year, current_year + 1)
    }


def fetch_url(url):
    """Download *url* and return the raw response body.

    The connection is closed even when reading fails.
    """
    with closing(urlopen(url)) as response:
        return response.read()


def parse_html_to_list(html):
    """Parse HTML tables to a list of ``(date, note, amount)`` tuples.

    Only rows with exactly three cells (date, note, amount) are used.  The
    first table row of the document is skipped, and so is every row whose
    first cell is not a date of the form ``DD.MM.YYYY``.

    html -- the page, either as UTF-8 encoded bytes or as decoded text.

    The date is returned as a ``datetime``, the note is the first link text
    of the second cell (or its first text node, or '' if it has none), and
    the amount is a float parsed with ',' as the decimal separator.

    Raises ValueError if the amount cell of an accepted row is not a number.
    """
    from lxml import etree  # deferred: third-party, only needed for parsing

    if isinstance(html, bytes):
        html = html.decode('utf-8')

    data = []
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    for index, row in enumerate(tree.xpath('//table//tr')):
        cells = row.getchildren()
        if index == 0 or len(cells) != 3:
            continue
        try:
            # An empty cell has text None; treat it like any other non-date.
            when = datetime.strptime((cells[0].text or "").strip(), "%d.%m.%Y")
        except ValueError:
            continue  # not a data row
        note = cells[1].xpath(".//a/text()") or cells[1].xpath(".//text()")
        amount = float((cells[2].text or "").strip().replace(",", "."))
        data.append((when, note[0].strip() if note else "", amount))
    return data


def _groupy(item):
    """Grouping key of an entry: the (year, month) of its date."""
    return item[0].year, item[0].month


def get_date_grouped_list(list_):
    """Sum the amounts of ``(date, note, amount)`` entries per month.

    Returns a list of ``(date(year, month, 1), total)`` tuples in
    chronological order, one per month that occurs in *list_*.
    """
    # groupby() only merges adjacent items, so the input has to be ordered by
    # the grouping key first.  Sorting by (year, month) rather than by the
    # entries themselves also avoids comparing date with datetime objects,
    # which raises TypeError.
    ordered = sorted(list_, key=_groupy)
    return [
        (date(year, month, 1), sum(x[2] for x in items))
        for (year, month), items in groupby(ordered, _groupy)
    ]


def _leading_months(donations):
    """Return zero-amount entries for the months before the first donation.

    The entries cover January up to (excluding) the month of the first
    donation, within that donation's year, so the plotted series start at the
    beginning of the year.  *donations* must be sorted by date.
    """
    if not donations:
        return []
    first = donations[0][0]
    return [(date(first.year, month, 1), '', 0) for month in range(1, first.month)]


def get_cash_available(agg_donations, agg_expenses):
    """Return the running account balance as ``(month, balance)`` tuples.

    agg_donations, agg_expenses -- per-month totals as produced by
    get_date_grouped_list().

    Every month that occurs in either list is visited in chronological
    order.  A month missing from one list counts as 0 there, so a gap in the
    expense table (the source table had one in March 2009) no longer needs a
    hard-coded entry and no longer drops that month's donations.
    """
    donated = dict(agg_donations)
    spent = dict(agg_expenses)
    cash = 0
    cash_available = []
    for month in sorted(set(donated) | set(spent)):
        cash = cash - spent.get(month, 0) + donated.get(month, 0)
        cash_available.append((month, cash))
    return cash_available


def plot_to_file(agg_donations, agg_expenses, cash_available, outfile):
    """Plot the three monthly series and save the figure to *outfile*."""
    import numpy as np  # noqa: F401 -- kept from the original imports, unused
    import matplotlib
    matplotlib.use('agg')  # select the backend before pyplot is imported
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(15, 7))
    # donations
    plt.plot([x[0] for x in agg_donations], [x[1] for x in agg_donations],
             color="green", label="Spenden")
    # expenses
    plt.plot([x[0] for x in agg_expenses], [x[1] for x in agg_expenses],
             color="red", label="Ausgaben")
    # account balance
    plt.plot([x[0] for x in cash_available], [x[1] for x in cash_available],
             color="blue", label="Spendenkonto")
    plt.xlabel("Datum")
    plt.ylabel("Euro")
    plt.legend()
    plt.savefig(outfile)
    plt.close(fig)


def print_monthly_csv(agg_list):
    """Print ``(date, total)`` tuples as ``YYYY-MM-DD;total`` lines.

    Not called by main(); in the original script this code sat after the
    final exit() and referred to an undefined name.
    """
    for x in agg_list:
        print("%s;%.2f" % (x[0].strftime("%Y-%m-%d"), x[1]))


def print_donations_csv(donations):
    """Print ``(date, note, amount)`` tuples as ``YYYY-MM-DD;"note";amount`` lines.

    Not called by main(); in the original script this code sat after the
    final exit().
    """
    for x in donations:
        print("%s;\"%s\";%.2f" % (x[0].strftime("%Y-%m-%d"), x[1], x[2]))


def main(args):
    """Run the script with the command line *args* (including the program name).

    Returns the process exit status: 1 after printing the usage message when
    the argument count is wrong, 0 after the plot has been written.
    """
    if len(args) != 2:
        print("Usage: " + args[0] + " OUTFILE.png")
        return 1
    outfile = args[1]

    urls = get_donation_urls(datetime.now().year)
    donations = []
    for year in sorted(urls):
        donations += parse_html_to_list(fetch_url(urls[year]))
    donations.sort()
    expenses = parse_html_to_list(fetch_url(expenses_url))

    # Pad both series so they start in January of the first donation year.
    prefix = _leading_months(donations)
    agg_donations = get_date_grouped_list(prefix + donations)
    agg_expenses = get_date_grouped_list(prefix + expenses)
    cash_available = get_cash_available(agg_donations, agg_expenses)

    plot_to_file(agg_donations, agg_expenses, cash_available, outfile)
    return 0


if __name__ == "__main__":
    raise SystemExit(main(argv))