commit 2b2aa3aa29a69050509c06b1779604ce5d0ef376
parent 113556897b454281deb8858ff9998a4ab30c4f76
Author: Steve Gattuso <steve@stevegattuso.me>
Date: Sun, 5 Nov 2023 13:54:20 +0100
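Refactor schema version detection: infer the trip schema from the DataFrame's column names in store_trips instead of from the archive's object key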
Diffstat:
3 files changed, 36 insertions(+), 40 deletions(-)
diff --git a/bin/scraper b/bin/scraper
@@ -26,15 +26,6 @@ def is_persisted(archive: scrape_historical.MonthlyArchive) -> bool:
return len(trips) > 0
-def fetch_and_store_archive(archive: scrape_historical.MonthlyArchive):
- trips = scrape_historical.fetch_archive_dt(archive)
-
- if archive.schema_version == 1:
- store.store_v1_trips(trips)
- else:
- store.store_v2_trips(trips)
-
-
def main__fetch(args: argparse.Namespace):
archives = scrape_historical.fetch_archives()
@@ -51,7 +42,9 @@ def main__fetch(args: argparse.Namespace):
continue
logging.info(f'Fetching and storing {month_str}')
- fetch_and_store_archive(archive)
+
+ trips = scrape_historical.fetch_archive_dt(archive)
+ store.store_trips(trips)
def main__list():
@@ -60,7 +53,7 @@ def main__list():
print('Available archives:\n')
for archive in archives:
presence = '[x]' if is_persisted(archive) else '[ ]'
- print(f'\t{presence} {archive.date.strftime("%Y-%m")} (v{archive.schema_version})')
+ print(f'\t{presence} {archive.date.strftime("%Y-%m")}')
if __name__ == '__main__':
diff --git a/forerad/persistence.py b/forerad/persistence.py
@@ -63,31 +63,40 @@ class SQLiteStore():
index=False,
)
- def store_v1_trips(self, df):
+ def store_trips(self, df):
"""
- Stores a dataframe of historical trips in the v1 schema format
+ Stores a dataframe of historical trips, transforming the data depending
+ on the schema it detects
"""
- df = df.rename(columns={
- # Account for a weird in-between schema that starts on 2016-10
- 'Start Time': 'started_at',
- 'Stop Time': 'ended_at',
- 'Start Station ID': 'start_station_id',
- 'End Station ID': 'end_station_id',
- 'Start Station Latitude': 'start_lat',
- 'Start Station Longitude': 'start_lng',
- 'End Station Latitude': 'end_lat',
- 'End Station Longitude': 'end_lng',
- # ...back to our normally scheduled program:
- 'starttime': 'started_at',
- 'stoptime': 'ended_at',
- 'start station id': 'start_station_id',
- 'end station id': 'end_station_id',
- 'start station latitude': 'start_lat',
- 'start station longitude': 'start_lng',
- 'end station latitude': 'end_lat',
- 'end station longitude': 'end_lng',
- })
+ if 'starttime' in df.columns:
+ # Transform v1 schema
+ df = df.rename(columns={
+ 'starttime': 'started_at',
+ 'stoptime': 'ended_at',
+ 'start station id': 'start_station_id',
+ 'end station id': 'end_station_id',
+ 'start station latitude': 'start_lat',
+ 'start station longitude': 'start_lng',
+ 'end station latitude': 'end_lat',
+ 'end station longitude': 'end_lng',
+ })
+ elif 'Start Time' in df.columns:
+ # This is a weird in-between state of v1 and v2 that starts in the
+ # 2016-10 dataset
+ df = df.rename(columns={
+ 'Start Time': 'started_at',
+ 'Stop Time': 'ended_at',
+ 'Start Station ID': 'start_station_id',
+ 'End Station ID': 'end_station_id',
+ 'Start Station Latitude': 'start_lat',
+ 'Start Station Longitude': 'start_lng',
+ 'End Station Latitude': 'end_lat',
+ 'End Station Longitude': 'end_lng',
+ })
+ else:
+ # The v2 dataset should work with no transformations
+ pass
return self.__store_formatted(df)
@@ -95,4 +104,5 @@ class SQLiteStore():
"""
Stores a dataframe of historical trips in the newer v2 schema format
"""
+ import pdb; pdb.set_trace()
return self.__store_formatted(df)
diff --git a/forerad/scrapers/historical.py b/forerad/scrapers/historical.py
@@ -39,13 +39,6 @@ class MonthlyArchive():
return f'<MonthlyArchive {self.date} {self.object_key} />'
@property
- def schema_version(self):
- if '.csv.zip' in self.object_key:
- return 2
-
- return 1
-
- @property
def csv_name(self) -> str:
if '.csv.zip' in self.object_key:
return self.object_key.replace('.csv.zip', '.csv')