#!/usr/bin/env python2.4

from BeautifulSoup import BeautifulSoup
import os, re, calendar, datetime, sys, time, urllib2, csv
from htmlentitydefs import name2codepoint
from pytz import timezone

pt = timezone('US/Pacific')

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

# borrowed from Ka-Ping Yee's scrape.py
def htmldecode(text):
    """Decode HTML entities in the given text."""
    if type(text) is unicode:
        uchr = unichr
    else:
        # For byte strings, stay in str space where possible and only
        # promote to unicode for code points above Latin-1.
        uchr = lambda value: value > 255 and unichr(value) or chr(value)
    def entitydecode(match, uchr=uchr):
        entity = match.group(1)
        if entity.startswith('#x'):
            return uchr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return uchr(int(entity[1:]))
        elif entity in name2codepoint:
            return uchr(name2codepoint[entity])
        else:
            return match.group(0)
    return charrefpat.sub(entitydecode, text)

def cell_text(cell):
    return htmldecode(''.join(cell.findAll(text=True)).strip())

def parse_date(date):
    try:
        # Full form: month, day, year, and time.
        date = time.strptime(date, '%b %d %Y at %H:%M %p')
    except ValueError:
        try:
            # No year given; assume the current year.
            date = list(time.strptime(date, '%b %d at %H:%M %p'))
            date[0] = time.localtime().tm_year
        except ValueError:
            # Only a weekday name given; back up to its most recent
            # occurrence.  The parentheses matter: without them, % binds
            # tighter than -, and a negative delta would push the date
            # into the future.
            day_of_week = list(calendar.day_abbr).index(date[:3])
            date = list(time.strptime(date, '%a at %H:%M %p'))
            day = datetime.date.today()
            day -= datetime.timedelta(days=(day.weekday() - day_of_week) % 7)
            date[:3] = day.timetuple()[:3]
    # time.strptime is broken for nonlocal time zones, hooray
    # date = time.gmtime(time.mktime(date))
    # Keyword arguments can't follow *args in a Python 2 call, hence **{}.
    return datetime.datetime(*date[:6], **{'tzinfo': pt})

def parse_csets(f, out, last_cset_id=None):
    doc = f.read()
    soup = BeautifulSoup(doc)
    rows = soup.findAll(attrs={'class': 'GridRow'})
    for row in rows:
        cell_download, cell_browse, cell_date, cell_comment, cell_cset, \
            cell_by, cell_downloads = row.findAll('td')
        download_url = htmldecode(cell_download.find('a')['href'])
        # use Mercurial's default date format
        date = parse_date(cell_text(cell_date)).strftime('%Y-%m-%d %H:%M:%S')
        comment = cell_text(cell_comment)
        cset_id = cell_text(cell_cset)
        by = cell_text(cell_by)
        if cset_id == last_cset_id:
            # Reached the newest changeset from the previous run; stop here.
            return
        out.writerow((cset_id, download_url, date, comment, by))
        print '%6s@%s %-8s %s' % (cset_id, date, by, comment)
    assert last_cset_id is None, 'too many checkins since last run'

if __name__ == '__main__':
    if len(sys.argv) == 1:
        f = urllib2.urlopen('http://www.codeplex.com/IronPython/SourceControl/ListDownloadableCommits.aspx')
        old_commits = csv.reader(file('commits.csv'))
        out = csv.writer(file('commits-temp.csv', 'w'))
        # Prepend the new changesets, then re-emit the old ones.
        last_cset = old_commits.next()
        parse_csets(f, out, last_cset_id=last_cset[0])
        out.writerow(last_cset)
        out.writerows(old_commits)
        # Drop the writer so its file is closed before the rename.
        del out
        os.rename('commits-temp.csv', 'commits.csv')
    else:
        for f in sys.argv[1:]:
            parse_csets(file(f), csv.writer(file('commits.csv', 'w')))