forerad

Utilities for collecting and analyzing Citibike data in Python
Log | Files | Refs | README

scraper (2786B)


      1 #!/usr/bin/env python3
      2 """
      3 This script allows you to download citibike ride history archives into your local database.
      4 """
      5 import sys
      6 import argparse
      7 import pandas as pd
      8 
      9 import forerad.persistence as persistence
     10 import forerad.utils as utils
     11 import forerad.scrapers.historical as scrape_historical
     12 
     13 
     # Instantiate the SQLite-backed store from forerad.persistence at import time.
     # NOTE(review): `store` is not referenced anywhere else in the visible part of
     # this script; presumably the constructor is called for its setup side
     # effects -- confirm before removing.
     14 store = persistence.SQLiteStore()
     15 
     16 
     17 def main__fetch(args: argparse.Namespace):
     18     archives = scrape_historical.HistoricalTripArchive.list_remote()
     19 
     20     if args.month is not None:
     21         archives = [a for a in archives if a.date.strftime('%Y-%m') == args.month]
     22     if args.month and len(archives) != 1:
     23         utils.logger.error(f'Month filter "{args.month}" yielded {len(archives)} results. Aborting!')
     24         sys.exit(1)
     25 
     26     for archive in archives:
     27         month_str = archive.date.strftime("%Y-%m")
     28         if archive.is_downloaded:
     29             utils.logger.info(f'{month_str} is already persisted, skipping.')
     30             continue
     31 
     32         utils.logger.info(f'Fetching and storing {month_str}')
     33         archive.fetch_df()
     34 
     35 
     36 def main__schema_report(args: argparse.Namespace):
     37     """
     38     Provides a nice lil report that shows which schema version each of the
     39     archives uses. If you use this with the --month argument it will also print
     40     out all of the column headers.
     41     """
     42     archives = scrape_historical.HistoricalTripArchive.list_cached()
     43 
     44     if args.month is not None:
     45         archives = [a for a in archives if a.month_str == args.month]
     46 
     47     print('Archive schema report:')
     48     for archive in archives:
     49         df = archive.fetch_df(normalize=False)
     50         if 'starttime' in df.columns:
     51             print(f'- {archive.month_str}: v1')
     52         elif 'Start Time' in df.columns:
     53             print(f'- {archive.month_str}: v2')
     54         elif 'started_at' in df.columns:
     55             print(f'- {archive.month_str}: v3')
     56         else:
     57             print(f'- {archive.month_str}: ??')
     58 
     59         if args.month:
     60             print(f"Columns: {', '.join(df.columns)}")
     61 
     62 def main__list():
     63     archives = scrape_historical.HistoricalTripArchive.list_remote()
     64 
     65     print('Available archives:\n')
     66     for archive in archives:
     67         presence = '[x]' if archive.is_downloaded else '[ ]'
     68         print(f'\t{presence} {archive.date.strftime("%Y-%m")}')
     69 
     70 
     71 if __name__ == '__main__':
     72     parser = argparse.ArgumentParser(
     73         description="Backfill historical trip data into your data store",
     74     )
     75     parser.add_argument('action', choices=['list', 'fetch', 'schema-report'])
     76     parser.add_argument('--month', help="The month to download, in YYYY-MM format")
     77 
     78     args = parser.parse_args()
     79 
     80     if args.action == 'fetch':
     81         main__fetch(args)
     82     elif args.action == 'schema-report':
     83         main__schema_report(args)
     84     elif args.action == 'list':
     85         main__list()