forerad

Utilities for collecting and analyzing Citibike data in Python

commit 6da36f7e4f10e281122a50db81de2b28bffad452
parent 711944d8bc88d9be4dc2e2c76264208080873209
Author: Steve Gattuso <steve@stevegattuso.me>
Date:   Mon,  6 Nov 2023 10:20:11 +0100

calculate hourly volume rollup

Diffstat:
M bin/hourly-volume-rollup       |  81 ++++++++++++++++++++++++++++++++-----------------------------------------------
M forerad/persistence.py         | 147 ++++++++++++++-----------------------------------------------------------------
M forerad/scrapers/historical.py |  60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M forerad/utils.py               |  12 ++++++++++++
4 files changed, 126 insertions(+), 174 deletions(-)

diff --git a/bin/hourly-volume-rollup b/bin/hourly-volume-rollup
@@ -16,69 +16,54 @@ log = utils.get_logger()
 # The day that Citibike began publishing data on
 ORIGIN_DATE = datetime.date(2013, 6, 1)
 
-def detect_missing_members(year: int, month: int) -> set[datetime.datetime]:
+def detect_status(year: int, month: int) -> tuple[set[pd.Timestamp], set[pd.Timestamp], set[pd.Timestamp]]:
     """
     Given a month, look at the hourly rollup and generate a set of missing
     members
     """
-forerad/scrapers/historical
-
-
-def main__populate(month_str):
-    # Calculate the last day of last month
-    most_recent_trip_dt = store.fetch_latest_trip_dt()
-    if most_recent_trip_dt is None:
-        raise Exception('No trips found!')
-
-    # Calculate dataframe of all days since the ORIGIN_DATE
-    to_populate = set(pd.date_range(
-        start=ORIGIN_DATE,
-        end=most_recent_trip_dt.date(),
-        freq="D"
+    first_day = datetime.date(year, month, 1)
+    next_month = utils.next_month(first_day.year, first_day.month)
+    expected_members = set(pd.date_range(
+        start=first_day,
+        end=next_month,
+        freq="1H",
+        inclusive="left"
     ))
 
-    # If provided, filter out any dates not in the specified month
-    if month_str is not None:
-        year, month = utils.parse_month_str(month_str)
-        to_populate = {d for d in to_populate if d.year == year and d.month == month}
-
-    rollup = store.fetch_daily_volume_rollup()
-    if len(rollup) > 0:
-        populated_dates = set(rollup['date'])
-        to_populate -= populated_dates
+    rollup = store.fetch_hourly_volume_rollup(first_day, next_month)
+    actual_members = set(rollup.index.values)
 
-    if len(to_populate) == 0:
-        log.info('Nothing remaining to populate. Exiting!')
-        return
+    missing = expected_members - actual_members
+    return expected_members, actual_members, missing
 
-    log.info(f"{len(to_populate)} members need to be populated")
-    to_populate = sorted(list(to_populate))
 
-    to_insert: list[tuple[datetime.date, int]] = []
-    prev_month = None
-    for date in to_populate:
-        # Ratelimit logging
-        cur_month = date.strftime('%Y-%m')
-        if cur_month != prev_month:
-            log.info(f"Rolling up {cur_month}")
-            prev_month = cur_month
+def derive_rollup(a: historical.HistoricalTripArchive):
+    df = a.fetch_df()[['started_at']].reset_index()
+    grouper = pd.Grouper(key='started_at', freq='1H')
+    df = df.groupby(grouper).count()\
+        .rename(columns={'tempid': 'trip_count'})\
+        .reset_index()
 
-        trip_count = store.fetch_daily_volume(date)
-        to_insert.append((date, trip_count))
+    store.write_hourly_volume_rollup(df)
+    log.info(f"Wrote {len(df)} members to table")
 
-    log.info(f"Writing {len(to_insert)} rows to rollup")
-    store.write_daily_volume_rollup(to_insert)
+
+def main__populate(month_str):
+    archives = historical.HistoricalTripArchive.list_cached()
+    if month_str is not None:
+        year, month = utils.parse_month_str(month_str)
+        archives = [a for a in archives if a.date.year == year and a.date.month == month]
+
+    [derive_rollup(a) for a in archives]
 
 
 def main__status():
-    cache = historical.ArchiveCache()
-    available_archives = cache.list_archives()
+    available_archives = historical.HistoricalTripArchive.list_cached()
+
+    print('Archive status:')
     for archive in available_archives:
-        first_date = archive.date
-        last_date = (archive.date + datetime.timedelta(days=32)).replace(day=1) - datetime.timedelta(1)
-        pd.date_range(
-            start=first_date,
-            end=last_date,
-        )
+        expected, actual, missing = detect_status(archive.date.year, archive.date.month)
+        presence = '[x]' if len(missing) == 0 else '[ ]'
+        print(f'\t{presence} {archive.date.year}-{archive.date.month:02d} ({len(actual)}/{len(expected)})')
 
 
 if __name__ == '__main__':
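
Note: derive_rollup above buckets trips into hourly bins with pandas' Grouper
and counts rows per bin; hours with no trips still get a row with a zero
count, which is what lets detect_status treat every hour of a fully rolled-up
month as present. A minimal, self-contained sketch of the same technique,
using made-up timestamps and a plain RangeIndex standing in for the repo's
tempid index:

    import pandas as pd

    # Fabricated sample trips; not real Citibike data
    trips = pd.DataFrame({
        'started_at': pd.to_datetime([
            '2013-06-01 00:05', '2013-06-01 00:40', '2013-06-01 02:15',
        ]),
    }).reset_index()  # 'index' becomes a countable column, like 'tempid' above

    grouper = pd.Grouper(key='started_at', freq='1H')
    hourly = trips.groupby(grouper).count()\
        .rename(columns={'index': 'trip_count'})\
        .reset_index()
    print(hourly)
    #            started_at  trip_count
    # 0 2013-06-01 00:00:00           2
    # 1 2013-06-01 01:00:00           0   (empty hours still get a member)
    # 2 2013-06-01 02:00:00           1
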
diff --git a/forerad/persistence.py b/forerad/persistence.py
@@ -5,8 +5,7 @@ import datetime
 import pathlib
 import pytz
 
-TZ_NYC = pytz.timezone('America/New_York')
-TZ_UTC = pytz.timezone('UTC')
+import forerad.utils as utils
 
 class SQLiteStore():
     def __init__(self):
@@ -19,142 +18,46 @@ class SQLiteStore():
         be used to query the started_at and ended_at columns.
         """
         dt = datetime.datetime.combine(date, datetime.datetime.min.time())
-        unix_epoch_str = TZ_NYC.localize(dt)\
-            .astimezone(TZ_UTC)\
+        unix_epoch_str = utils.TZ_NYC.localize(dt)\
+            .astimezone(utils.TZ_UTC)\
             .strftime("%s")
 
         return int(unix_epoch_str)
 
-    def fetch_trips(self, start_date: datetime.date, end_date: datetime.date):
-        """
-        Fetches a dataframe of all trips where started_at >= first_dt and
-        started_at < last_dt. Note that both columns are stored in UTC.
-        """
+
+    def fetch_hourly_volume_rollup(self, start_date: datetime.date, end_date: datetime.date) -> pd.DataFrame:
         query = """
-            SELECT *
-            FROM historical_trips
+            SELECT
+                date(datetime, 'unixepoch') AS datetime,
+                trip_count
+            FROM hourly_volume_rollup
             WHERE
-                started_at >= ? AND
-                started_at < ?
+                datetime >= ? AND
+                datetime < ?
         """
 
         start_dt = self.__localize_date(start_date)
         end_dt = self.__localize_date(end_date)
-        return pd.read_sql(query, self.db, params=(start_dt, end_dt))
-
-    def fetch_latest_trip_dt(self) -> datetime.datetime | None:
-        query = "SELECT MAX(started_at) FROM historical_trips"
-        result = pd.read_sql(query, self.db).iat[0, 0]
+        results = pd.read_sql(query, self.db, params=(start_dt, end_dt))
+        return results.set_index('datetime')
 
-        if result is None:
-            return None
-
-        return TZ_UTC\
-            .localize(datetime.datetime.fromtimestamp(int(float(result))))\
-            .astimezone(TZ_NYC)
-
-    def store_trips(self, df):
-        """
-        Stores a dataframe of historical trips, transforming the data depending
-        on the schema it detects
-        """
-        if 'starttime' in df.columns:
-            # Transform v1 schema
-            df = df.rename(columns={
-                'starttime': 'started_at',
-                'stoptime': 'ended_at',
-                'start station id': 'start_station_id',
-                'end station id': 'end_station_id',
-                'start station latitude': 'start_lat',
-                'start station longitude': 'start_lng',
-                'end station latitude': 'end_lat',
-                'end station longitude': 'end_lng',
-            })
-        elif 'Start Time' in df.columns:
-            # This is a weird in-between state of v1 and v2 that starts in the
-            # 2016-10 dataset
-            df = df.rename(columns={
-                'Start Time': 'started_at',
-                'Stop Time': 'ended_at',
-                'Start Station ID': 'start_station_id',
-                'End Station ID': 'end_station_id',
-                'Start Station Latitude': 'start_lat',
-                'Start Station Longitude': 'start_lng',
-                'End Station Latitude': 'end_lat',
-                'End Station Longitude': 'end_lng',
-            })
-
-        if 'rideable_type' not in df.columns:
-            df['rideable_type'] = None
-
-        df = df[[
-            'rideable_type', 'started_at', 'ended_at', 'start_station_id',
-            'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
-        ]]
-        df = df.replace({np.nan: None})
-
-        # Localize timestamps, convert to UTC, then convert to UNIX epoch to
-        # make things speedier when querying in sqlite
-        df['started_at'] = pd.to_datetime(df['started_at'])\
-            .dt.tz_localize(TZ_UTC)\
-            .astype(int) / 10**9
-        df['ended_at'] = pd.to_datetime(df['ended_at'])\
-            .dt.tz_localize(TZ_UTC)\
-            .astype(int) / 10**9
-
-        df.to_sql(
-            name="historical_trips",
-            con=self.db,
-            if_exists='append',
-            index=False,
-        )
-
-    def fetch_daily_volume(self, date: datetime.date) -> int:
-        """
-        Calculate the number of trips taken on a given day.
+    def write_hourly_volume_rollup(self, values: pd.DataFrame):
         """
-        query = """
-            SELECT
-                date(started_at, 'unixepoch'),
-                COUNT(*)
-            FROM historical_trips
-            WHERE
-                started_at >= ? AND
-                started_at < ?
+        Writes hourly volume rollup members to the table
         """
+        values['started_at'] = (values['started_at'].astype(int) / 10**9).astype(int)
 
-        start_dt = self.__localize_date(date)
-        end_dt = self.__localize_date(date + datetime.timedelta(days=1))
-
-        result = pd.read_sql(query, self.db, params=(start_dt, end_dt))
-        return result.iat[0, 1]
-
-    def fetch_daily_volume_rollup(self) -> pd.DataFrame:
-        """
-        Returns a set of days that have an entry in the volume rollup table
-        """
        query = """
-            SELECT
-                date(date, 'unixepoch') AS date,
-                trip_count
-            FROM daily_volume_rollup
-        """
-        df = pd.read_sql(query, self.db)
-        df['date'] = pd.to_datetime(df['date'])
-
-        return df
-
-    def write_daily_volume_rollup(self, values: list[tuple[datetime.date, int]]):
-        """
-        Writes a daily volume rollup member to the table
+            INSERT OR REPLACE INTO hourly_volume_rollup (
+                datetime, trip_count, calculated_at
+            ) VALUES (
+                :started_at, :trip_count, UNIXEPOCH()
+            )
         """
-        to_insert = [(self.__localize_date(d), c) for d, c in values]
-        df = pd.DataFrame(to_insert, columns=['date', 'trip_count'])
-        df.to_sql(
-            name="daily_volume_rollup",
-            con=self.db,
-            if_exists="append",
-            index=False,
-        )
+        cur = self.db.cursor()
+        print(values)
+        cur.executemany(query, values.to_dict('records'))
+        self.db.commit()
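
Note: write_hourly_volume_rollup above pairs sqlite3's named-parameter style
with executemany, feeding it DataFrame.to_dict('records') so each row binds as
a dict keyed by column name; INSERT OR REPLACE keeps the rollup idempotent
when a month is re-derived. A sketch of the same pattern against a throwaway
in-memory database (the table schema here is guessed from the INSERT
statement, and UNIXEPOCH() needs SQLite 3.38 or newer):

    import sqlite3
    import pandas as pd

    db = sqlite3.connect(':memory:')
    db.execute("""
        CREATE TABLE hourly_volume_rollup (
            datetime INTEGER PRIMARY KEY,
            trip_count INTEGER,
            calculated_at INTEGER
        )
    """)

    rows = pd.DataFrame({
        'started_at': [1370044800, 1370048400],  # epoch seconds, one per hour
        'trip_count': [12, 7],
    })
    db.executemany("""
        INSERT OR REPLACE INTO hourly_volume_rollup (
            datetime, trip_count, calculated_at
        ) VALUES (:started_at, :trip_count, UNIXEPOCH())
    """, rows.to_dict('records'))
    db.commit()
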
diff --git a/forerad/scrapers/historical.py b/forerad/scrapers/historical.py
@@ -5,6 +5,7 @@ import io
 import boto3
 import pathlib
 import pandas as pd
+import numpy as np
 import botocore.client as bclient
 
 from botocore import UNSIGNED
@@ -12,7 +13,7 @@ import forerad.utils as utils
 log = utils.get_logger()
 
 ARCHIVE_REGEX = re.compile("^([0-9]{4})([0-9]{2})-citibike-tripdata((.zip$)|(.csv.zip$))")
-CACHE_DIR = pathlib.Path('.forerad-cache')
+CACHE_DIR = pathlib.Path(__file__).parent.parent.parent / pathlib.Path('.forerad-cache')
 TRIP_BUCKET = 'tripdata'
 
 if not CACHE_DIR.exists():
     CACHE_DIR.mkdir()
@@ -68,8 +69,9 @@ class HistoricalTripArchive():
         Returns a list of HistoricalTripArchive objects for all archives that
         have already been downloaded
         """
-        unfiltered = [cls.from_archive_path(p) for p in CACHE_DIR.glob('*')]
-        return [a for a in unfiltered if a is not None]
+        archives = [cls.from_archive_path(p) for p in CACHE_DIR.glob('*')]
+        archives = [a for a in archives if a is not None]
+        return sorted(archives, key=lambda a: a.date)
 
     def __fetch_cached_blob(self) -> io.BytesIO | None:
         archive_path = CACHE_DIR / self.object_key
@@ -101,6 +103,7 @@ class HistoricalTripArchive():
             blob = io.BytesIO(resp['Body'].read())
             self.__store_blob(blob)
 
+        # Unzip the CSV into a dataframe
         with zipfile.ZipFile(blob, 'r') as zip_archive:
             file_list = zip_archive.namelist()
             csv_name = self.csv_name
@@ -114,7 +117,56 @@ class HistoricalTripArchive():
                 csv_name = file_list[0]
 
             with zip_archive.open(csv_name, 'r') as csv:
-                return pd.read_csv(csv)
+                df = pd.read_csv(csv)
+
+        # Normalize the dataframe
+        if 'starttime' in df.columns:
+            # Transform v1 schema
+            df = df.rename(columns={
+                'starttime': 'started_at',
+                'stoptime': 'ended_at',
+                'start station id': 'start_station_id',
+                'end station id': 'end_station_id',
+                'start station latitude': 'start_lat',
+                'start station longitude': 'start_lng',
+                'end station latitude': 'end_lat',
+                'end station longitude': 'end_lng',
+            })
+        elif 'Start Time' in df.columns:
+            # This is a weird in-between state of v1 and v2 that starts in the
+            # 2016-10 dataset
+            df = df.rename(columns={
+                'Start Time': 'started_at',
+                'Stop Time': 'ended_at',
+                'Start Station ID': 'start_station_id',
+                'End Station ID': 'end_station_id',
+                'Start Station Latitude': 'start_lat',
+                'Start Station Longitude': 'start_lng',
+                'End Station Latitude': 'end_lat',
+                'End Station Longitude': 'end_lng',
+            })
+
+        if 'rideable_type' not in df.columns:
+            df['rideable_type'] = None
+
+        df = df[[
+            'rideable_type', 'started_at', 'ended_at', 'start_station_id',
+            'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
+        ]]
+        df = df.replace({np.nan: None})
+
+        # Localize timestamps and convert to UTC; the conversion to UNIX epoch
+        # (which makes sqlite queries speedier) now happens at write time
+        df['started_at'] = pd.to_datetime(df['started_at'])\
+            .dt.tz_localize(utils.TZ_NYC, ambiguous='NaT')\
+            .dt.tz_convert(utils.TZ_UTC)
+        df['ended_at'] = pd.to_datetime(df['ended_at'])\
+            .dt.tz_localize(utils.TZ_NYC, ambiguous='NaT')\
+            .dt.tz_convert(utils.TZ_UTC)
+
+        df.index.name = 'tempid'
+
+        return df.dropna(subset=['started_at', 'ended_at'])
 
     @property
     def is_downloaded(self) -> bool:
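
Note: the ambiguous='NaT' argument to tz_localize above matters on the
fall-back DST night, when wall-clock times between 1:00 and 2:00 occur twice
in America/New_York; instead of raising AmbiguousTimeError, those rows become
NaT and the trailing dropna discards them. A small demonstration, using the
TZ_NYC definition this commit moves into forerad/utils.py:

    import pandas as pd
    import pytz

    TZ_NYC = pytz.timezone('America/New_York')

    # 2013-11-03 01:30 happened twice (once in EDT, once in EST);
    # 03:00 is unambiguous
    stamps = pd.Series(pd.to_datetime(['2013-11-03 01:30', '2013-11-03 03:00']))
    print(stamps.dt.tz_localize(TZ_NYC, ambiguous='NaT'))
    # 0                         NaT
    # 1   2013-11-03 03:00:00-05:00
    # dtype: datetime64[ns, America/New_York]
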
diff --git a/forerad/utils.py b/forerad/utils.py
@@ -1,5 +1,10 @@
 import re
 import logging
+import datetime
+import pytz
+
+TZ_NYC = pytz.timezone('America/New_York')
+TZ_UTC = pytz.timezone('UTC')
 
 def parse_month_str(month_str: str) -> tuple[int, int]:
     """
@@ -13,6 +18,13 @@ def parse_month_str(month_str: str) -> tuple[int, int]:
     year, month = match.groups()
     return (int(year), int(month))
 
+def next_month(year, month) -> datetime.date:
+    """
+    Given a year and month, calculate the first day of the next month
+    """
+    date = datetime.date(year, month, 1)
+    return (date + datetime.timedelta(days=32)).replace(day=1)
+
 def get_logger():
     logger = logging.getLogger('forerad')
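
Note: the timedelta(days=32) trick in next_month works because no month is
longer than 31 days, so the first of any month plus 32 days always lands
inside the following month; replace(day=1) then snaps back to that month's
first day, and the year rolls over for December with no special case. A quick
check:

    import datetime

    def next_month(year, month) -> datetime.date:
        date = datetime.date(year, month, 1)
        return (date + datetime.timedelta(days=32)).replace(day=1)

    assert next_month(2013, 6) == datetime.date(2013, 7, 1)
    assert next_month(2013, 12) == datetime.date(2014, 1, 1)  # year rollover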