use unix timestamps for better speed - forerad - Utilities for collecting and analyzing with Citibike data in Python

commit 137dcd04ffb7d6c091f2a98b0e64892806e358ba
parent 430aff814eff63bde364c8db6439aaec13193014
Author: Steve Gattuso <steve@stevegattuso.me>
Date:   Sun,  5 Nov 2023 13:44:47 +0100

use unix timestamps for better speed

Diffstat:
M forerad/persistence.py  | 24 ++++++++++++++++++++++--
M migrations/20231030_01_HoM9F-create-the-initial-schema.sql  | 4 ++--

2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/forerad/persistence.py b/forerad/persistence.py
@@ -3,6 +3,10 @@ import pandas as pd
 import numpy as np
 import datetime
 import pathlib
+import pytz
+
+TZ_NYC = pytz.timezone('America/New_York')
+TZ_UTC = pytz.timezone('UTC')
 
 class SQLiteStore():
     def __init__(self):
@@ -14,7 +18,6 @@ class SQLiteStore():
         Fetches a dataframe of all trips where started_at >= first_dt and
         started_at < last_dt. Note that both columns are stored in UTC.
         """
-
         query = """
             SELECT *
             FROM historical_trips
@@ -23,7 +26,15 @@ class SQLiteStore():
                 started_at < ?
         """
 
-        return pd.read_sql(query, self.db, params=(first_dt, last_dt))
+        # Localize, convert, and format as UNIX epoch
+        start = TZ_NYC.localize(first_dt)\
+            .astimezone(TZ_UTC)\
+            .strftime('%s')
+        end = TZ_NYC.localize(last_dt)\
+            .astimezone(TZ_UTC)\
+            .strftime('%s')
+
+        return pd.read_sql(query, self.db, params=(start, end))
 
     def __store_formatted(self, df):
         """
@@ -36,6 +47,15 @@ class SQLiteStore():
         ]]
         df = df.replace({np.nan: None})
 
+        # Localize timestamps, convert to UTC, then convert to UNIX epoch to
+        # make things speedier when querying in sqlite
+        df['started_at'] = pd.to_datetime(df['started_at'])\
+            .dt.tz_localize(TZ_UTC)\
+            .astype(int) / 10**9
+        df['ended_at'] = pd.to_datetime(df['ended_at'])\
+            .dt.tz_localize(TZ_UTC)\
+            .astype(int) / 10**9
+
         df.to_sql(
             name="historical_trips",
             con=self.db,
diff --git a/migrations/20231030_01_HoM9F-create-the-initial-schema.sql b/migrations/20231030_01_HoM9F-create-the-initial-schema.sql
@@ -3,8 +3,8 @@
 CREATE TABLE historical_trips (
     ride_id INTEGER PRIMARY KEY,
     rideable_type TEXT,
-    started_at TEXT NOT NULL,
-    ended_at TEXT NOT NULL,
+    started_at NUMERIC NOT NULL,
+    ended_at NUMERIC NOT NULL,
     start_lat NUMERIC,
     start_lng NUMERIC,
     end_lat NUMERIC,

	forerad Utilities for collecting and analyzing with Citibike data in Python
	Log \| Files \| Refs \| README

M	forerad/persistence.py	\|	24	++++++++++++++++++++++--
M	migrations/20231030_01_HoM9F-create-the-initial-schema.sql	\|	4	++--