{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8dd25d03",
   "metadata": {},
   "source": [
    "# Cointegration Analysis of Two Stocks\n",
    "\n",
    "Tests a pair of equities for cointegration (Engle-Granger + Johansen) on daily data,\n",
    "then builds a z-score mean-reversion strategy on 1-minute intraday data with a rolling\n",
    "hedge ratio, transaction costs, and a net-PnL backtest."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a1ced6f",
   "metadata": {},
   "source": [
    "## Cell 1: Inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cebf9c69",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "symbol_a = \"AAPL\"\n",
    "symbol_b = \"MSFT\"\n",
    "\n",
    "# Never hardcode credentials in a notebook -- read the key from the environment.\n",
    "API_KEY = os.environ.get(\"NASDAQ_DATA_LINK_API_KEY\", \"\")\n",
    "if not API_KEY:\n",
    "    raise RuntimeError(\"Set the NASDAQ_DATA_LINK_API_KEY environment variable before running.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d20bf0f1",
   "metadata": {},
   "source": [
    "## Cell 2: Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bff2de78",
   "metadata": {},
   "outputs": [],
   "source": [
    "import yfinance as yf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.tsa.stattools import coint\n",
    "from statsmodels.tsa.vector_ar.vecm import coint_johansen\n",
    "import plotly.graph_objects as go\n",
    "import plotly.subplots as sp\n",
    "import quandl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c27469d",
   "metadata": {},
   "source": [
    "## Cell 3: Load Data\n",
    "\n",
    "Daily adjusted closes come from Nasdaq Data Link (EOD); 1-minute intraday closes come\n",
    "from yfinance (1-minute bars are only available for roughly the last 7 days)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f79c2d26",
   "metadata": {},
   "outputs": [],
   "source": [
    "quandl.ApiConfig.api_key = API_KEY\n",
    "\n",
    "def get_nasdaq_close(symbol):\n",
    "    \"\"\"Return the daily adjusted-close series for `symbol`, or None on download failure.\"\"\"\n",
    "    try:\n",
    "        data = quandl.get(f\"EOD/{symbol}\", start_date=\"2023-01-01\")\n",
    "        return data['Adj_Close']\n",
    "    except Exception as e:\n",
    "        print(f\"Error downloading {symbol} from Nasdaq Data Link (EOD):\", e)\n",
    "        return None\n",
    "\n",
    "# Daily closes (Nasdaq Data Link)\n",
    "daily_a = get_nasdaq_close(symbol_a)\n",
    "daily_b = get_nasdaq_close(symbol_b)\n",
    "\n",
    "# Fail loudly with a clear message instead of letting pd.concat raise\n",
    "# the opaque \"All objects passed were None\".\n",
    "if daily_a is None or daily_b is None:\n",
    "    raise RuntimeError(\"Daily download failed for at least one symbol -- \"\n",
    "                       \"check the API key and EOD dataset entitlement.\")\n",
    "\n",
    "# Keep the last 180 calendar days of overlapping observations.\n",
    "# (DataFrame.last(\"180D\") is deprecated; filter on the index instead.)\n",
    "daily = pd.concat([daily_a, daily_b], axis=1).dropna()\n",
    "daily = daily[daily.index >= daily.index.max() - pd.Timedelta(days=180)]\n",
    "daily.columns = [symbol_a, symbol_b]\n",
    "\n",
    "# Intraday 1-minute closes (yfinance). These were previously referenced but never\n",
    "# defined, which made this cell fail with a NameError on a fresh kernel.\n",
    "intraday_a = yf.download(symbol_a, period=\"5d\", interval=\"1m\")[\"Close\"]\n",
    "intraday_b = yf.download(symbol_b, period=\"5d\", interval=\"1m\")[\"Close\"]\n",
    "intraday = pd.concat([intraday_a, intraday_b], axis=1).dropna()\n",
    "intraday.columns = [symbol_a, symbol_b]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7e4d1a0",
   "metadata": {},
   "source": [
    "## Cell 4: Cointegration Step 1 (Daily)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b2bf2ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "def engle_granger(a, b):\n",
    "    \"\"\"Engle-Granger two-step test: OLS of a on b, then ADF on the residual spread.\n",
    "\n",
    "    Returns the ADF p-value of the spread (small p => stationary spread => cointegrated).\n",
    "    \"\"\"\n",
    "    model = sm.OLS(a, sm.add_constant(b)).fit()\n",
    "    spread = model.resid\n",
    "    # adfuller returns (stat, pvalue, usedlag, nobs, critvals, icbest);\n",
    "    # unpacking into exactly three names raised a ValueError before.\n",
    "    adf_stat, pvalue, *_ = sm.tsa.adfuller(spread)\n",
    "    return pvalue\n",
    "\n",
    "def johansen_test(df):\n",
    "    \"\"\"True if the Johansen trace statistic rejects rank 0 at the 5% level.\"\"\"\n",
    "    result = coint_johansen(df, det_order=0, k_ar_diff=1)\n",
    "    trace_stat = result.lr1\n",
    "    crit_vals = result.cvt[:, 1]  # column 1 holds the 5% critical values\n",
    "    return trace_stat[0] > crit_vals[0]\n",
    "\n",
    "# Rolling validation: re-test cointegration over a sliding 30-day window so a\n",
    "# single lucky full-sample fit cannot drive the conclusion.\n",
    "rolling_pvalues = []\n",
    "rolling_johansen = []\n",
    "window = 30\n",
    "for i in range(window, len(daily)):\n",
    "    a_slice = daily[symbol_a].iloc[i - window:i]\n",
    "    b_slice = daily[symbol_b].iloc[i - window:i]\n",
    "    df_slice = pd.concat([a_slice, b_slice], axis=1)\n",
    "    rolling_pvalues.append(engle_granger(a_slice, b_slice))\n",
    "    rolling_johansen.append(johansen_test(df_slice))\n",
    "\n",
    "mean_pval = np.mean(rolling_pvalues)\n",
    "passed_johansen_ratio = np.mean(rolling_johansen)\n",
    "\n",
    "print(\"\\nDaily Cointegration Results (Rolling 30-day):\")\n",
    "print(f\"Average Engle-Granger ADF p-value: {mean_pval:.4f}\")\n",
    "print(f\"Johansen test passed in {passed_johansen_ratio * 100:.1f}% of windows\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2f8e6b3",
   "metadata": {},
   "source": [
    "## Cell 5: Cointegration Step 2 (Intraday)\n",
    "\n",
    "Rolling 60-minute hedge ratio, spread, and rolling z-score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fce1bd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "window_hr = 60  # minutes in the rolling OLS window\n",
    "rolling_hedge_ratios = []\n",
    "for i in range(window_hr, len(intraday)):\n",
    "    X = intraday[symbol_b].iloc[i - window_hr:i]\n",
    "    y = intraday[symbol_a].iloc[i - window_hr:i]\n",
    "    model = sm.OLS(y, sm.add_constant(X)).fit()\n",
    "    # Positional access into params must use .iloc; bare [] is label-based\n",
    "    # and its integer fallback is deprecated.\n",
    "    rolling_hedge_ratios.append(model.params.iloc[1])\n",
    "\n",
    "# Extend the hedge ratio to the full index: forward-fill, then back-fill the\n",
    "# warm-up period. (.fillna(method=...) is deprecated in modern pandas.)\n",
    "rolling_hedge_ratios = pd.Series(rolling_hedge_ratios, index=intraday.index[window_hr:])\n",
    "hedge_ratio_full = rolling_hedge_ratios.reindex(intraday.index).ffill().bfill()\n",
    "\n",
    "# Spread and rolling z-score (same 60-minute window as the hedge ratio)\n",
    "spread = intraday[symbol_a] - hedge_ratio_full * intraday[symbol_b]\n",
    "zscore = (spread - spread.rolling(60).mean()) / spread.rolling(60).std()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9a5f4e7",
   "metadata": {},
   "source": [
    "## Cell 6: Trading Signals and PnL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f3a7c21",
   "metadata": {},
   "outputs": [],
   "source": [
    "entry_threshold = 2\n",
    "exit_threshold = 0.5\n",
    "transaction_cost = 0.0005  # 5 basis points per leg\n",
    "\n",
    "signals = pd.DataFrame(index=intraday.index)\n",
    "signals['zscore'] = zscore\n",
    "signals['position'] = 0\n",
    "signals.loc[signals['zscore'] > entry_threshold, 'position'] = -1  # short A, long B\n",
    "signals.loc[signals['zscore'] < -entry_threshold, 'position'] = 1  # long A, short B\n",
    "# Flatten inside the exit band, then hold the last position in between.\n",
    "signals['position'] = signals['position'].where(~signals['zscore'].between(-exit_threshold, exit_threshold), 0)\n",
    "signals['position'] = signals['position'].ffill().fillna(0)\n",
    "\n",
    "# PnL simulation -- position and hedge ratio are lagged one bar to avoid lookahead.\n",
    "returns_a = intraday[symbol_a].pct_change().fillna(0)\n",
    "returns_b = intraday[symbol_b].pct_change().fillna(0)\n",
    "hedge_ratio_series = hedge_ratio_full.shift(1)\n",
    "\n",
    "# Gross PnL\n",
    "signals['pnl'] = signals['position'].shift(1) * (returns_a - hedge_ratio_series * returns_b)\n",
    "\n",
    "# Transaction costs scale with the gross notional traded on both legs, so the\n",
    "# B-leg size must use |hedge ratio| (a negative ratio still costs money to trade).\n",
    "signals['trades'] = signals['position'].diff().abs()\n",
    "signals['costs'] = signals['trades'] * transaction_cost * (1 + hedge_ratio_series.abs())\n",
    "signals['net_pnl'] = signals['pnl'] - signals['costs']\n",
    "signals['cum_pnl'] = signals['net_pnl'].cumsum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1c0b9a8",
   "metadata": {},
   "source": [
    "## Cell 7: Export Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04d10694",
   "metadata": {},
   "outputs": [],
   "source": [
    "signals.to_csv(\"signals_and_pnl.csv\")\n",
    "print(\"Exported: signals_and_pnl.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6d2c8b5",
   "metadata": {},
   "source": [
    "## Cell 8: Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57df70b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = sp.make_subplots(rows=3, cols=1, shared_xaxes=True,\n",
    "                       subplot_titles=(\"Price Series (1-min)\", \"Spread Z-Score\", \"Cumulative PnL (Net)\",))\n",
    "\n",
    "# Price series\n",
    "fig.add_trace(go.Scatter(x=intraday.index, y=intraday[symbol_a], name=symbol_a), row=1, col=1)\n",
    "fig.add_trace(go.Scatter(x=intraday.index, y=intraday[symbol_b], name=symbol_b), row=1, col=1)\n",
    "\n",
    "# Z-score with entry (dotted) and mean (dashed) reference lines\n",
    "fig.add_trace(go.Scatter(x=signals.index, y=signals['zscore'], name=\"Z-Score\"), row=2, col=1)\n",
    "fig.add_hline(y=entry_threshold, line_dash=\"dot\", row=2, col=1)\n",
    "fig.add_hline(y=-entry_threshold, line_dash=\"dot\", row=2, col=1)\n",
    "fig.add_hline(y=0, line_dash=\"dash\", row=2, col=1)\n",
    "\n",
    "# Cumulative net PnL\n",
    "fig.add_trace(go.Scatter(x=signals.index, y=signals['cum_pnl'], name=\"Cumulative PnL (Net)\"), row=3, col=1)\n",
    "\n",
    "fig.update_layout(title=f\"Cointegration Strategy: {symbol_a} vs {symbol_b} (1-Minute Data)\", height=950)\n",
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3.12-venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}