scraper (2786B)
#!/usr/bin/env python3
"""
This script allows you to download citibike ride history archives into your
local database.

The first positional argument selects the action:

- ``list``: print every archive returned by ``HistoricalTripArchive.list_remote()``
  together with a marker showing its ``is_downloaded`` flag.
- ``fetch``: call ``fetch_df()`` on every listed archive that is not yet
  downloaded, or only on the single archive selected with ``--month``.
- ``schema-report``: print which schema version each archive returned by
  ``HistoricalTripArchive.list_cached()`` uses.
"""
import sys
import argparse
import pandas as pd

import forerad.persistence as persistence
import forerad.utils as utils
import forerad.scrapers.historical as scrape_historical


# NOTE(review): `store` is not referenced anywhere in this script; presumably
# instantiating it prepares the local database -- confirm before removing.
store = persistence.SQLiteStore()

# (marker column, schema label) pairs, checked in this order. An archive is
# labelled with the first entry whose marker column is present in its frame.
_SCHEMA_MARKERS = (
    ('starttime', 'v1'),
    ('Start Time', 'v2'),
    ('started_at', 'v3'),
)


def _schema_version(columns) -> str:
    """Return the schema label for a set of column names, or '??' if unknown."""
    for marker, label in _SCHEMA_MARKERS:
        if marker in columns:
            return label
    return '??'


def main__fetch(args: argparse.Namespace) -> None:
    """
    Fetch every remote archive that is not downloaded yet.

    When ``args.month`` is given (YYYY-MM), only the archive for that month is
    considered; the process exits with status 1 unless the filter matches
    exactly one archive.
    """
    archives = scrape_historical.HistoricalTripArchive.list_remote()

    if args.month is not None:
        archives = [a for a in archives if a.date.strftime('%Y-%m') == args.month]
        # Guard on "a filter was given", not on truthiness, so that an empty
        # --month value aborts loudly instead of silently doing nothing.
        if len(archives) != 1:
            utils.logger.error(f'Month filter "{args.month}" yielded {len(archives)} results. Aborting!')
            sys.exit(1)

    for archive in archives:
        month_str = archive.date.strftime('%Y-%m')
        if archive.is_downloaded:
            utils.logger.info(f'{month_str} is already persisted, skipping.')
            continue

        utils.logger.info(f'Fetching and storing {month_str}')
        archive.fetch_df()


def main__schema_report(args: argparse.Namespace) -> None:
    """
    Provides a nice lil report that shows which schema version each of the
    archives uses. If you use this with the --month argument it will also print
    out all of the column headers.
    """
    archives = scrape_historical.HistoricalTripArchive.list_cached()

    if args.month is not None:
        archives = [a for a in archives if a.month_str == args.month]

    print('Archive schema report:')
    for archive in archives:
        df = archive.fetch_df(normalize=False)
        print(f'- {archive.month_str}: {_schema_version(df.columns)}')

        # Printed per archive, inside the loop, so `df` is always bound here
        # even when the month filter matched nothing.
        if args.month:
            # str() each label so non-string column labels cannot break join().
            print(f"Columns: {', '.join(map(str, df.columns))}")


def main__list() -> None:
    """Print every remote archive with a marker showing if it is downloaded."""
    archives = scrape_historical.HistoricalTripArchive.list_remote()

    print('Available archives:\n')
    for archive in archives:
        presence = '[x]' if archive.is_downloaded else '[ ]'
        print(f'\t{presence} {archive.date.strftime("%Y-%m")}')


def main() -> None:
    """Parse the command line and run the selected action."""
    parser = argparse.ArgumentParser(
        description="Backfill historical trip data into your data store",
    )
    parser.add_argument('action', choices=['list', 'fetch', 'schema-report'])
    parser.add_argument('--month', help="The month to download, in YYYY-MM format")

    args = parser.parse_args()

    if args.action == 'fetch':
        main__fetch(args)
    elif args.action == 'schema-report':
        main__schema_report(args)
    elif args.action == 'list':
        main__list()


if __name__ == '__main__':
    main()