From e86c3abc5312f65f86fdb8bf051061019d810ea1 Mon Sep 17 00:00:00 2001 From: Oleg Sheynin Date: Sun, 20 Jul 2025 18:05:13 -0400 Subject: [PATCH] initial --- .gitignore | 2 + notebooks/stocks_cointegration.ipynb | 285 +++++++++++++++++++++++++++ requirements.txt | 199 +++++++++++++++++++ scripts/load_equity_pair_daily.sh | 145 ++++++++++++++ 4 files changed, 631 insertions(+) create mode 100644 .gitignore create mode 100644 notebooks/stocks_cointegration.ipynb create mode 100644 requirements.txt create mode 100755 scripts/load_equity_pair_daily.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7c5a162 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.history/ +daily_data/ diff --git a/notebooks/stocks_cointegration.ipynb b/notebooks/stocks_cointegration.ipynb new file mode 100644 index 0000000..3626f50 --- /dev/null +++ b/notebooks/stocks_cointegration.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8dd25d03", + "metadata": {}, + "source": [ + "# Cointegration Analysis of Two Stocks" + ] + }, + { + "cell_type": "markdown", + "id": "5a1ced6f", + "metadata": {}, + "source": [ + "## --- Cell 1: Input symbols ---" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cebf9c69", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "symbol_a = \"AAPL\"\n", + "symbol_b = \"MSFT\"\n", + "\n", + "API_KEY = \"hV11XYhaase6vxw2qmhh\"" + ] + }, + { + "cell_type": "markdown", + "id": "d20bf0f1", + "metadata": {}, + "source": [ + "# --- Cell 2: Imports ---" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bff2de78", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import yfinance as yf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import statsmodels.api as sm\n", + "from statsmodels.tsa.stattools import coint\n", + "from statsmodels.tsa.vector_ar.vecm import coint_johansen\n", + "import plotly.graph_objects as go\n", + "import plotly.subplots as sp\n", + "import quandl\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4c27469d", + "metadata": {}, + "source": [ + "## --- Cell 3: Load Data ---" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f79c2d26", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error downloading AAPL from Nasdaq Data Link (EOD): (Status 403) Something went wrong. Please try again. If you continue to have problems, please contact us at connect@quandl.com.\n", + "Error downloading MSFT from Nasdaq Data Link (EOD): (Status 403) Something went wrong. Please try again. If you continue to have problems, please contact us at connect@quandl.com.\n" + ] + }, + { + "ename": "ValueError", + "evalue": "All objects passed were None", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m 13\u001b[39m daily_b = get_nasdaq_close(symbol_b)\n\u001b[32m 15\u001b[39m \u001b[38;5;66;03m# Filter to the last 180 days of overlap\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m daily = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdaily_a\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdaily_b\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m.dropna().last(\u001b[33m\"\u001b[39m\u001b[33m180D\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 17\u001b[39m daily.columns = [symbol_a, symbol_b]\n\u001b[32m 20\u001b[39m intraday = pd.concat([intraday_a, intraday_b], axis=\u001b[32m1\u001b[39m).dropna()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.pyenv/python3.12-venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py:382\u001b[39m, in \u001b[36mconcat\u001b[39m\u001b[34m(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)\u001b[39m\n\u001b[32m 379\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m copy \u001b[38;5;129;01mand\u001b[39;00m using_copy_on_write():\n\u001b[32m 380\u001b[39m copy = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m382\u001b[39m op = \u001b[43m_Concatenator\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 383\u001b[39m \u001b[43m \u001b[49m\u001b[43mobjs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 384\u001b[39m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 385\u001b[39m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m=\u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 386\u001b[39m \u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m=\u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 387\u001b[39m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 388\u001b[39m \u001b[43m \u001b[49m\u001b[43mlevels\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 389\u001b[39m \u001b[43m \u001b[49m\u001b[43mnames\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 390\u001b[39m \u001b[43m \u001b[49m\u001b[43mverify_integrity\u001b[49m\u001b[43m=\u001b[49m\u001b[43mverify_integrity\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 391\u001b[39m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 392\u001b[39m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[43m=\u001b[49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 393\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 395\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m op.get_result()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.pyenv/python3.12-venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py:445\u001b[39m, in \u001b[36m_Concatenator.__init__\u001b[39m\u001b[34m(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)\u001b[39m\n\u001b[32m 442\u001b[39m \u001b[38;5;28mself\u001b[39m.verify_integrity = verify_integrity\n\u001b[32m 443\u001b[39m \u001b[38;5;28mself\u001b[39m.copy = copy\n\u001b[32m--> \u001b[39m\u001b[32m445\u001b[39m objs, keys = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_clean_keys_and_objs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobjs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 447\u001b[39m \u001b[38;5;66;03m# figure out what our result ndim is going to be\u001b[39;00m\n\u001b[32m 448\u001b[39m ndims = \u001b[38;5;28mself\u001b[39m._get_ndims(objs)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.pyenv/python3.12-venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py:541\u001b[39m, in \u001b[36m_Concatenator._clean_keys_and_objs\u001b[39m\u001b[34m(self, objs, keys)\u001b[39m\n\u001b[32m 538\u001b[39m keys = Index(clean_keys, name=name, dtype=\u001b[38;5;28mgetattr\u001b[39m(keys, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[32m 540\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(objs_list) == \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m541\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mAll objects passed were None\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 543\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m objs_list, keys\n", + "\u001b[31mValueError\u001b[39m: All objects passed were None" + ] + } + ], + "source": [ + "\n", + "\n", + "quandl.ApiConfig.api_key = API_KEY\n", + "\n", + "def get_nasdaq_close(symbol):\n", + " try:\n", + " data = quandl.get(f\"EOD/{symbol}\", start_date=\"2023-01-01\")\n", + " return data['Adj_Close']\n", + " except Exception as e:\n", + " print(f\"Error downloading {symbol} from Nasdaq Data Link (EOD):\", e)\n", + " return None\n", + "\n", + "# Daily (from Nasdaq Data Link)\n", + "daily_a = get_nasdaq_close(symbol_a)\n", + "daily_b = get_nasdaq_close(symbol_b)\n", + "\n", + "# Filter to the last 180 days of overlap\n", + "daily = pd.concat([daily_a, daily_b], axis=1).dropna().last(\"180D\")\n", + "daily.columns = [symbol_a, symbol_b]\n", + "\n", + "\n", + "intraday = pd.concat([intraday_a, intraday_b], axis=1).dropna()\n", + "intraday.columns = [symbol_a, symbol_b]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b2bf2ce", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Cell 4: Cointegration Step 1 (Daily) ---\n", + "# Engle-Granger (ADF)\n", + "def engle_granger(a, b):\n", + " model = sm.OLS(a, sm.add_constant(b)).fit()\n", + " spread = model.resid\n", + " adf_stat, pvalue, _ = sm.tsa.adfuller(spread)\n", + " return pvalue\n", + "\n", + "# Johansen Test\n", + "def johansen_test(df):\n", + " result = coint_johansen(df, det_order=0, k_ar_diff=1)\n", + " trace_stat = result.lr1\n", + " crit_vals = result.cvt[:, 1] # 5% level\n", + " return trace_stat[0] > crit_vals[0]\n", + "\n", + "# Rolling Validation (30-day window)\n", + "rolling_pvalues = []\n", + "rolling_johansen = []\n", + "window = 30\n", + "for i in range(window, len(daily)):\n", + " a_slice = daily[symbol_a].iloc[i - window:i]\n", + " b_slice = daily[symbol_b].iloc[i - window:i]\n", + " df_slice = pd.concat([a_slice, b_slice], axis=1)\n", + " pval = engle_granger(a_slice, b_slice)\n", + " rolling_pvalues.append(pval)\n", + " rolling_johansen.append(johansen_test(df_slice))\n", + "\n", + "mean_pval = np.mean(rolling_pvalues)\n", + "passed_johansen_ratio = np.mean(rolling_johansen)\n", + "\n", + "print(\"\\nDaily Cointegration Results (Rolling 30-day):\")\n", + "print(f\"Average Engle-Granger ADF p-value: {mean_pval:.4f}\")\n", + "print(f\"Johansen test passed in {passed_johansen_ratio * 100:.1f}% of windows\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fce1bd8", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Cell 5: Cointegration Step 2 (Intraday) ---\n", + "# Rolling hedge ratio (60-min window)\n", + "window_hr = 60\n", + "rolling_hedge_ratios = []\n", + "for i in range(window_hr, len(intraday)):\n", + " X = intraday[symbol_b].iloc[i - window_hr:i]\n", + " y = intraday[symbol_a].iloc[i - window_hr:i]\n", + " model = sm.OLS(y, sm.add_constant(X)).fit()\n", + " rolling_hedge_ratios.append(model.params[1])\n", + "\n", + "# Extend hedge ratio to full length\n", + "rolling_hedge_ratios = pd.Series(rolling_hedge_ratios, index=intraday.index[window_hr:])\n", + "hedge_ratio_full = rolling_hedge_ratios.reindex(intraday.index, method='ffill').fillna(method='bfill')\n", + "\n", + "# Spread and Z-score\n", + "spread = intraday[symbol_a] - hedge_ratio_full * intraday[symbol_b]\n", + "zscore = (spread - spread.rolling(60).mean()) / spread.rolling(60).std()\n", + "\n", + "# --- Cell 6: Trading Signals ---\n", + "entry_threshold = 2\n", + "exit_threshold = 0.5\n", + "transaction_cost = 0.0005 # 5 basis points per leg\n", + "\n", + "signals = pd.DataFrame(index=intraday.index)\n", + "signals['zscore'] = zscore\n", + "signals['position'] = 0\n", + "signals.loc[signals['zscore'] > entry_threshold, 'position'] = -1 # short A, long B\n", + "signals.loc[signals['zscore'] < -entry_threshold, 'position'] = 1 # long A, short B\n", + "signals['position'] = signals['position'].where(~signals['zscore'].between(-exit_threshold, exit_threshold), 0)\n", + "signals['position'] = signals['position'].ffill().fillna(0)\n", + "\n", + "# PnL simulation\n", + "returns_a = intraday[symbol_a].pct_change().fillna(0)\n", + "returns_b = intraday[symbol_b].pct_change().fillna(0)\n", + "hedge_ratio_series = hedge_ratio_full.shift(1)\n", + "\n", + "# Gross PnL\n", + "signals['pnl'] = signals['position'].shift(1) * (returns_a - hedge_ratio_series * returns_b)\n", + "\n", + "# Transaction cost estimation\n", + "signals['trades'] = signals['position'].diff().abs()\n", + "signals['costs'] = signals['trades'] * transaction_cost * (1 + hedge_ratio_series)\n", + "signals['net_pnl'] = signals['pnl'] - signals['costs']\n", + "signals['cum_pnl'] = signals['net_pnl'].cumsum()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04d10694", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Cell 7: Export Results ---\n", + "signals.to_csv(\"signals_and_pnl.csv\")\n", + "print(\"Exported: signals_and_pnl.csv\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57df70b6", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Cell 8: Visualization ---\n", + "fig = sp.make_subplots(rows=3, cols=1, shared_xaxes=True,\n", + " subplot_titles=(\"Price Series (1-min)\", \"Spread Z-Score\", \"Cumulative PnL (Net)\",))\n", + "\n", + "# Price series\n", + "fig.add_trace(go.Scatter(x=intraday.index, y=intraday[symbol_a], name=symbol_a), row=1, col=1)\n", + "fig.add_trace(go.Scatter(x=intraday.index, y=intraday[symbol_b], name=symbol_b), row=1, col=1)\n", + "\n", + "# Z-score\n", + "fig.add_trace(go.Scatter(x=signals.index, y=signals['zscore'], name=\"Z-Score\"), row=2, col=1)\n", + "fig.add_hline(y=entry_threshold, line_dash=\"dot\", row=2, col=1)\n", + "fig.add_hline(y=-entry_threshold, line_dash=\"dot\", row=2, col=1)\n", + "fig.add_hline(y=0, line_dash=\"dash\", row=2, col=1)\n", + "\n", + "# PnL\n", + "fig.add_trace(go.Scatter(x=signals.index, y=signals['cum_pnl'], name=\"Cumulative PnL (Net)\"), row=3, col=1)\n", + "\n", + "fig.update_layout(title=f\"Cointegration Strategy: {symbol_a} vs {symbol_b} (1-Minute Data)\", height=950)\n", + "fig.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3.12-venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6a1e080 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,199 @@ +aiohttp>=3.8.4 +aiosignal>=1.3.1 +async-timeout>=4.0.2 +attrs>=21.2.0 +beautifulsoup4>=4.10.0 +black>=23.3.0 +flake8>=6.0.0 +certifi>=2020.6.20 +chardet>=4.0.0 +charset-normalizer>=3.1.0 +click>=8.0.3 +colorama>=0.4.4 +configobj>=5.0.6 +cryptography>=3.4.8 +distro>=1.7.0 +docker>=5.0.3 +dockerpty>=0.4.1 +docopt>=0.6.2 +eyeD3>=0.8.10 +filelock>=3.6.0 +frozenlist>=1.3.3 +grpcio>=1.30.2 +hjson>=3.0.2 +html5lib>=1.1 +httplib2>=0.20.2 +idna>=3.3 +ipython>=8.18.1 +ipywidgets>=8.1.1 +ifaddr>=0.1.7 +IMDbPY>=2021.4.18 +ipykernel>=6.29.5 +jeepney>=0.7.1 +jsonschema>=3.2.0 +jupyter>=1.0.0 +keyring>=23.5.0 +launchpadlib>=1.10.16 +lazr.restfulclient>=0.14.4 +lazr.uri>=1.0.6 +lxml>=4.8.0 +Mako>=1.1.3 +Markdown>=3.3.6 +MarkupSafe>=2.0.1 +matplotlib>=3.10.3 +more-itertools>=8.10.0 +multidict>=6.0.4 +mypy>=0.942 +mypy-extensions>=0.4.3 +nbformat>=5.10.2 +netaddr>=0.8.0 +######### netifaces>=0.11.0 +numpy>=1.26.4,<2.3.0 +oauthlib>=3.2.0 +packaging>=23.1 +pandas>=2.2.3 +pathspec>=0.11.1 +pexpect>=4.8.0 +Pillow>=9.0.1 +platformdirs>=3.2.0 +plotly>=5.19.0 +protobuf>=3.12.4 +psutil>=5.9.0 +ptyprocess>=0.7.0 +pycurl>=7.44.1 +pyelftools>=0.27 +Pygments>=2.11.2 +pyparsing>=2.4.7 +pyrsistent>=0.18.1 +python-debian>=0.1.43 #+ubuntu1.1 +python-dotenv>=0.19.2 +python-magic>=0.4.24 +python-xlib>=0.29 +pyxdg>=0.27 +PyYAML>=6.0 +reportlab>=3.6.8 +requests>=2.25.1 +requests-file>=1.5.1 +seaborn>=0.13.2 +SecretStorage>=3.3.1 +setproctitle>=1.2.2 +six>=1.16.0 +soupsieve>=2.3.1 +ssh-import-id>=5.11 +statsmodels>=0.14.4 +texttable>=1.6.4 +tldextract>=3.1.2 +tomli>=1.2.2 +######## typed-ast>=1.4.3 +types-aiofiles>=0.1 +types-annoy>=1.17 +types-appdirs>=1.4 +types-atomicwrites>=1.4 +types-aws-xray-sdk>=2.8 +types-babel>=2.9 +types-backports-abc>=0.5 +types-backports.ssl-match-hostname>=3.7 +types-beautifulsoup4>=4.10 +types-bleach>=4.1 +types-boto>=2.49 +types-braintree>=4.11 +types-cachetools>=4.2 +types-caldav>=0.8 +types-certifi>=2020.4 +types-characteristic>=14.3 +types-chardet>=4.0 +types-click>=7.1 +types-click-spinner>=0.1 +types-colorama>=0.4 +types-commonmark>=0.9 +types-contextvars>=0.1 +types-croniter>=1.0 +types-cryptography>=3.3 +types-dataclasses>=0.1 +types-dateparser>=1.0 +types-DateTimeRange>=0.1 +types-decorator>=0.1 +types-Deprecated>=1.2 +types-docopt>=0.6 +types-docutils>=0.17 +types-editdistance>=0.5 +types-emoji>=1.2 +types-entrypoints>=0.3 +types-enum34>=1.1 +types-filelock>=3.2 +types-first>=2.0 +types-Flask>=1.1 +types-freezegun>=1.1 +types-frozendict>=0.1 +types-futures>=3.3 +types-html5lib>=1.1 +types-httplib2>=0.19 +types-humanfriendly>=9.2 +types-ipaddress>=1.0 +types-itsdangerous>=1.1 +types-JACK-Client>=0.1 +types-Jinja2>=2.11 +types-jmespath>=0.10 +types-jsonschema>=3.2 +types-Markdown>=3.3 +types-MarkupSafe>=1.1 +types-mock>=4.0 +types-mypy-extensions>=0.4 +types-mysqlclient>=2.0 +types-oauthlib>=3.1 +types-orjson>=3.6 +types-paramiko>=2.7 +types-Pillow>=8.3 +types-polib>=1.1 +types-prettytable>=2.1 +types-protobuf>=3.17 +types-psutil>=5.8 +types-psycopg2>=2.9 +types-pyaudio>=0.2 +types-pycurl>=0.1 +types-pyfarmhash>=0.2 +types-Pygments>=2.9 +types-PyMySQL>=1.0 +types-pyOpenSSL>=20.0 +types-pyRFC3339>=0.1 +types-pysftp>=0.2 +types-pytest-lazy-fixture>=0.6 +types-python-dateutil>=2.8 +types-python-gflags>=3.1 +types-python-nmap>=0.6 +types-python-slugify>=5.0 +types-pytz>=2021.1 +types-pyvmomi>=7.0 +types-PyYAML>=5.4 +types-redis>=3.5 +types-requests>=2.25 +types-retry>=0.9 +types-selenium>=3.141 +types-Send2Trash>=1.8 +types-setuptools>=57.4 +types-simplejson>=3.17 +types-singledispatch>=3.7 +types-six>=1.16 +types-slumber>=0.7 +types-stripe>=2.59 +types-tabulate>=0.8 +types-termcolor>=1.1 +types-toml>=0.10 +types-toposort>=1.6 +types-ttkthemes>=3.2 +types-typed-ast>=1.4 +types-tzlocal>=0.1 +types-ujson>=0.1 +types-vobject>=0.9 +types-waitress>=0.1 +types-Werkzeug>=1.0 +types-xxhash>=2.0 +typing-extensions>=3.10.0.2 +Unidecode>=1.3.3 +urllib3>=1.26.5 +wadllib>=1.3.6 +webencodings>=0.5.1 +websocket-client>=1.2.3 +yarl>=1.9.1 +yfinance>=0.2.65 +zipp>=1.0.0 diff --git a/scripts/load_equity_pair_daily.sh b/scripts/load_equity_pair_daily.sh new file mode 100755 index 0000000..82263ef --- /dev/null +++ b/scripts/load_equity_pair_daily.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash + +# Usage: ./scripts/load_equity_pair_daily.sh -A GS -B DIA -f 20241201 -t 20250131 -T ./daily_md + +usage() { + echo "Usage: $0 -A -B -f -t -T " + exit 1 +} +# ---------------- cmdline +while getopts "A:B:f:t:T:h" opt; do + case ${opt} in + h) + usage + ;; + A ) + symbolA=$OPTARG + ;; + B ) + symbolB=$OPTARG + ;; + f ) + from_date=$OPTARG + ;; + t ) + to_date=$OPTARG + ;; + T ) + target_dir=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" >&2 + usage + ;; + : ) + echo "Option -$OPTARG requires an argument." >&2 + usage + ;; + esac +done +# ---------------- cmdline +if [ -z ${symbolA} ] || [ -z ${symbolB} ] || [ -z ${from_date} ] ||[ -z ${to_date} ] || [ -z ${target_dir} ]; then + usage +fi +SourceRoot=cvtt@hs01.cvtt.vpn:/mnt/usb1/md_archive/equity/alpaca_md +TargetFile=$(realpath ${target_dir})/${from_date}-${to_date}.${symbolA}-${symbolB}.daily.db + +mkdir -p ${target_dir} || exit 1 + +temp_dir=$(mktemp -d) +pushd ${temp_dir} +clean_temp() { + popd + rm -rf ${temp_dir} +} +trap clean_temp EXIT + +echo " ------ Downloading..." + +load_symbol() { + # Loop over dates from from_date to to_date + symbol=${1} + current_date="$from_date" + while [[ "$current_date" -le "$to_date" ]]; do + year=${current_date:0:4} + symb_initial=${symbol:0:1} + remote_path="${SourceRoot}/${year}/${symb_initial}/${symbol}/${current_date}.${symbol}.alpaca_1m_bars.db.gz" + + echo "Fetching: $remote_path" + Cmd="rsync -avz ${remote_path} ${temp_dir}/ || echo \"Missing: ${current_date}\"" + echo $Cmd + eval $Cmd + + # Increment date + current_date=$(date -d "$current_date +1 day" +"%Y%m%d") + done +} + +load_symbol ${symbolA} +load_symbol ${symbolB} + +gunzip *.gz +ls -l ${temp_dir} + +sqlite3 "$TargetFile" < time('16:00:00'); + +EOF + sqlite3 $TargetFile <