forerad

Utilities for collecting and analyzing Citibike data in Python
Log | Files | Refs | README

commit 2b2aa3aa29a69050509c06b1779604ce5d0ef376
parent 113556897b454281deb8858ff9998a4ab30c4f76
Author: Steve Gattuso <steve@stevegattuso.me>
Date:   Sun,  5 Nov 2023 13:54:20 +0100

refactor schema version detection

Diffstat:
M bin/scraper | 15 ++++-----------
M forerad/persistence.py | 54 ++++++++++++++++++++++++++++++++----------------------
M forerad/scrapers/historical.py | 7 -------
3 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/bin/scraper b/bin/scraper @@ -26,15 +26,6 @@ def is_persisted(archive: scrape_historical.MonthlyArchive) -> bool: return len(trips) > 0 -def fetch_and_store_archive(archive: scrape_historical.MonthlyArchive): - trips = scrape_historical.fetch_archive_dt(archive) - - if archive.schema_version == 1: - store.store_v1_trips(trips) - else: - store.store_v2_trips(trips) - - def main__fetch(args: argparse.Namespace): archives = scrape_historical.fetch_archives() @@ -51,7 +42,9 @@ def main__fetch(args: argparse.Namespace): continue logging.info(f'Fetching and storing {month_str}') - fetch_and_store_archive(archive) + + trips = scrape_historical.fetch_archive_dt(archive) + store.store_trips(trips) def main__list(): @@ -60,7 +53,7 @@ def main__list(): print('Available archives:\n') for archive in archives: presence = '[x]' if is_persisted(archive) else '[ ]' - print(f'\t{presence} {archive.date.strftime("%Y-%m")} (v{archive.schema_version})') + print(f'\t{presence} {archive.date.strftime("%Y-%m")}') if __name__ == '__main__': diff --git a/forerad/persistence.py b/forerad/persistence.py @@ -63,31 +63,40 @@ class SQLiteStore(): index=False, ) - def store_v1_trips(self, df): + def store_trips(self, df): """ - Stores a dataframe of historical trips in the v1 schema format + Stores a dataframe of historical trips, transforming the data depending + on the schema it detects """ - df = df.rename(columns={ - # Account for a weird in-between schema that starts on 2016-10 - 'Start Time': 'started_at', - 'Stop Time': 'ended_at', - 'Start Station ID': 'start_station_id', - 'End Station ID': 'end_station_id', - 'Start Station Latitude': 'start_lat', - 'Start Station Longitude': 'start_lng', - 'End Station Latitude': 'end_lat', - 'End Station Longitude': 'end_lng', - # ...back to our normally scheduled program: - 'starttime': 'started_at', - 'stoptime': 'ended_at', - 'start station id': 'start_station_id', - 'end station id': 'end_station_id', - 'start station latitude': 
'start_lat', - 'start station longitude': 'start_lng', - 'end station latitude': 'end_lat', - 'end station longitude': 'end_lng', - }) + if 'starttime' in df.columns: + # Transform v1 schema + df = df.rename(columns={ + 'starttime': 'started_at', + 'stoptime': 'ended_at', + 'start station id': 'start_station_id', + 'end station id': 'end_station_id', + 'start station latitude': 'start_lat', + 'start station longitude': 'start_lng', + 'end station latitude': 'end_lat', + 'end station longitude': 'end_lng', + }) + elif 'Start Time' in df.columns: + # This is a weird in-between state of v1 and v2 that starts in the + # 2016-10 dataset + df = df.rename(columns={ + 'Start Time': 'started_at', + 'Stop Time': 'ended_at', + 'Start Station ID': 'start_station_id', + 'End Station ID': 'end_station_id', + 'Start Station Latitude': 'start_lat', + 'Start Station Longitude': 'start_lng', + 'End Station Latitude': 'end_lat', + 'End Station Longitude': 'end_lng', + }) + else: + # The v2 dataset should work with no transformations + pass return self.__store_formatted(df) @@ -95,4 +104,5 @@ class SQLiteStore(): """ Stores a dataframe of historical trips in the newer v2 schema format """ + import pdb; pdb.set_trace() return self.__store_formatted(df) diff --git a/forerad/scrapers/historical.py b/forerad/scrapers/historical.py @@ -39,13 +39,6 @@ class MonthlyArchive(): return f'<MonthlyArchive {self.date} {self.object_key} />' @property - def schema_version(self): - if '.csv.zip' in self.object_key: - return 2 - - return 1 - - @property def csv_name(self) -> str: if '.csv.zip' in self.object_key: return self.object_key.replace('.csv.zip', '.csv')