# -*- coding: utf-8 -*-
"""Scrape per-season game pages from chgk.tvigra.ru into ``letopis.csv``.

For every year in ``YEARS`` and every season in ``SEASONS`` the matching
page is downloaded, split into per-game fragments, and one CSV row
``url,year,season,date,score,excl`` is written for each fragment.

The module runs on both Python 2.7 and Python 3.
"""
import io
import re
from contextlib import closing

try:  # Python 2
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

YEARS = range(2002, 2014)
SEASONS = ('spr', 'sum', 'aut', 'win')


def get_url(year, season):
    """Return the page URL for the given *year* and *season* code."""
    return 'http://chgk.tvigra.ru/letopis/?%(year)s/%(year)s_%(season)s' % {
        'year': year,
        'season': season
    }


def get_html(url):
    """Download *url* and return its body decoded from cp1251.

    The text is kept decoded (not re-encoded to UTF-8 bytes) so that the
    regular expressions below operate on characters rather than bytes;
    encoding happens once, when the CSV file is written.
    """
    # closing() guarantees the connection is released on both Python 2
    # and 3, even if read() raises.
    with closing(urlopen(url)) as response:
        raw = response.read()
    return raw.decode('cp1251')


def extract_games(html):
    """Split *html* into per-game fragments.

    ``re.split`` with one capture group yields
    ``[text, group, text, group, ...]``; the ``[2::2]`` slice keeps the
    text pieces that follow the first and every later delimiter.
    """
    # NOTE(review): this pattern can match the empty string, which suggests
    # the delimiter markup was lost at some point -- confirm against the
    # page source before relying on the result.
    return re.split(
        r'().*?',
        html,
        flags=re.DOTALL
    )[2::2]


def parse_game(game):
    """Extract date, score and the deciding-round score from one fragment.

    Returns a dict with string values under the keys ``'date'``,
    ``'score'`` (digits joined by ``':'``) and ``'excl'`` (``'d:d'`` or
    the empty string when no such remark is present).

    Raises ``AttributeError`` if the date pattern does not match.
    """
    # NOTE(review): the date pattern is just "text between two spaces";
    # presumably it once contained markup -- confirm.
    date = re.search(r' (.*?) ', game).group(1)
    score = re.findall(r'>(\d)<', game)
    # The lone '.' stands for one letter that is spelled either with or
    # without the diaeresis.  On decoded text that is a single character
    # (it was two bytes when matching against UTF-8 encoded data).
    tie_break = re.search(
        u'При сч.те (\\d):(\\d).*?ешающ.*?<',
        game,
        flags=re.DOTALL
    )
    excl = tie_break.groups() if tie_break is not None else ''
    return {
        'date': date,
        'score': ':'.join(score),
        'excl': ':'.join(excl)
    }


def main():
    """Fetch every season page and write one CSV row per game."""
    # io.open gives the same text-mode, explicit-encoding file object on
    # Python 2 and 3.
    with io.open('letopis.csv', 'w', encoding='utf-8') as f:
        for year in YEARS:
            for season in SEASONS:
                url = get_url(year, season)
                print(url)
                html = get_html(url)
                for game in extract_games(html):
                    data_str = u"%(date)s,%(score)s,%(excl)s" % parse_game(game)
                    f.write(u"%s,%s,%s,%s\n" % (url, year, season, data_str))
    print("Completed.")


if __name__ == "__main__":
    main()