#!/usr/bin/env python2.4

from BeautifulSoup import BeautifulSoup
import os, re, calendar, datetime, sys, time, urllib2, csv
from htmlentitydefs import name2codepoint
from pytz import timezone

pt = timezone('US/Pacific')

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

# borrowed from Ka-Ping Yee's scrape.py
def htmldecode(text):
    """Decode HTML entities in the given text."""
    if type(text) is unicode:
        uchr = unichr
    else:
        # For byte strings, stay in str space where possible and only
        # promote to unicode for code points above Latin-1.
        uchr = lambda value: value > 255 and unichr(value) or chr(value)
    def entitydecode(match, uchr=uchr):
        entity = match.group(1)
        if entity.startswith('#x'):
            return uchr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return uchr(int(entity[1:]))
        elif entity in name2codepoint:
            return uchr(name2codepoint[entity])
        else:
            return match.group(0)
    return charrefpat.sub(entitydecode, text)

def cell_text(cell):
    return htmldecode(''.join(cell.findAll(text=True)).strip())

def parse_date(date):
    try:
        # Full form: month, day, year, and time.
        date = time.strptime(date, '%b %d %Y at %H:%M %p')
    except ValueError:
        try:
            # No year given; assume the current year.
            date = list(time.strptime(date, '%b %d at %H:%M %p'))
            date[0] = time.localtime().tm_year
        except ValueError:
            # Only a weekday name given; back up to its most recent
            # occurrence.  The parentheses matter: without them, % binds
            # tighter than -, and a negative delta would push the date
            # into the future.
            day_of_week = list(calendar.day_abbr).index(date[:3])
            date = list(time.strptime(date, '%a at %H:%M %p'))
            day = datetime.date.today()
            day -= datetime.timedelta(days=(day.weekday() - day_of_week) % 7)
            date[:3] = day.timetuple()[:3]
    # time.strptime is broken for nonlocal time zones, hooray
    # date = time.gmtime(time.mktime(date))
    # Keyword arguments can't follow *args in a Python 2 call, hence **{}.
    return datetime.datetime(*date[:6], **{'tzinfo': pt})

def parse_csets(f, out, last_cset_id=None):
    doc = f.read()
    soup = BeautifulSoup(doc)
    rows = soup.findAll(attrs={'class': 'GridRow'})
    for row in rows:
        cell_download, cell_browse, cell_date, cell_comment, cell_cset, \
            cell_by, cell_downloads = row.findAll('td')
        download_url = htmldecode(cell_download.find('a')['href'])
        # use Mercurial's default date format
        date = parse_date(cell_text(cell_date)).strftime('%Y-%m-%d %H:%M:%S')
        comment = cell_text(cell_comment)
        cset_id = cell_text(cell_cset)
        by = cell_text(cell_by)
        if cset_id == last_cset_id:
            # Reached the newest changeset from the previous run; stop here.
            return
        out.writerow((cset_id, download_url, date, comment, by))
        print '%6s@%s %-8s %s' % (cset_id, date, by, comment)
    assert last_cset_id is None, 'too many checkins since last run'

if __name__ == '__main__':
    if len(sys.argv) == 1:
        f = urllib2.urlopen('http://www.codeplex.com/IronPython/SourceControl/ListDownloadableCommits.aspx')
        old_commits = csv.reader(file('commits.csv'))
        out = csv.writer(file('commits-temp.csv', 'w'))
        # Prepend the new changesets, then re-emit the old ones.
        last_cset = old_commits.next()
        parse_csets(f, out, last_cset_id=last_cset[0])
        out.writerow(last_cset)
        out.writerows(old_commits)
        # Drop the writer so its file is closed before the rename.
        del out
        os.rename('commits-temp.csv', 'commits.csv')
    else:
        for f in sys.argv[1:]:
            parse_csets(file(f), csv.writer(file('commits.csv', 'w')))