# Source: Universal-Pairs-Trading-System/data_handler.py (XanderRobbins/Universal-Pairs-Trading-System, branch main)
"""
Data acquisition and preprocessing for pairs trading strategies.
Supports any two cointegrated assets (stocks, ETFs, futures, forex, etc.)
"""
import pandas as pd
import numpy as np
import yfinance as yf
from typing import Tuple, Optional, Dict, Literal
from statsmodels.tsa.stattools import adfuller, coint
from scipy import stats
import warnings
warnings.filterwarnings('ignore')


class PairsDataHandler:
    """
    Data handler for pairs trading strategies.

    Fetches and aligns two financial instruments, computes spreads,
    validates cointegration/stationarity, and estimates hedge ratios.

    Example:
        >>> handler = PairsDataHandler(config, asset1_ticker='SPY', asset2_ticker='QQQ')
        >>> df = handler.fetch_data()
        >>> handler.test_cointegration()
    """

    def __init__(self, config, asset1_ticker: str, asset2_ticker: str,
                 pair_name: Optional[str] = None):
        self.config = config
        self.asset1_ticker = asset1_ticker
        self.asset2_ticker = asset2_ticker
        self.pair_name = pair_name or f"{asset1_ticker}-{asset2_ticker}"

        self.asset1_data = None
        self.asset2_data = None
        self.df = None
        self.validation_results = {}

    def fetch_data(self, verbose: bool = True) -> pd.DataFrame:
        """
        Download both assets, clean them, and return one aligned DataFrame.

        Each ticker is downloaded with ``yf.download`` (auto-adjusted prices)
        for ``config.start_date`` .. ``config.end_date``, passed through
        ``_clean_dataframe``, joined on the index (rows missing in either
        asset are dropped) and finally passed through ``_quality_filter``.

        Side effects: stores the result on ``self.df`` and the cleaned
        per-asset frames on ``self.asset1_data`` / ``self.asset2_data``.

        Args:
            verbose: Print progress and a short summary when True.

        Returns:
            DataFrame with Asset1_*/Asset2_* Close, High, Low and Volume columns.

        Raises:
            RuntimeError: If downloading or preparation fails for any reason,
                including a ticker that returns no rows or a pair left with no
                usable rows. The underlying error is chained as ``__cause__``.
        """
        if verbose:
            print(f"Fetching data for {self.pair_name}...")

        try:
            cleaned = []
            for ticker in (self.asset1_ticker, self.asset2_ticker):
                if verbose:
                    print(f"  Downloading {ticker}...")
                raw = yf.download(
                    ticker,
                    start=self.config.start_date,
                    end=self.config.end_date,
                    progress=False,
                    auto_adjust=True
                )
                # Fail with a clear message instead of an IndexError later on.
                if raw is None or raw.empty:
                    raise ValueError(f"no price data returned for {ticker}")
                cleaned.append(self._clean_dataframe(raw))

            asset1_clean, asset2_clean = cleaned

            # Column order: all Asset1 fields first, then all Asset2 fields.
            columns = {}
            for prefix, frame in (('Asset1', asset1_clean), ('Asset2', asset2_clean)):
                for field in ('Close', 'High', 'Low', 'Volume'):
                    columns[f'{prefix}_{field}'] = frame[field]
            df = pd.DataFrame(columns).dropna()
            if len(df) == 0:
                raise ValueError("the two assets share no dates with complete data")

            df = self._quality_filter(df)
            if len(df) == 0:
                raise ValueError("no rows left after quality filtering")

            self.df = df
            self.asset1_data = asset1_clean
            self.asset2_data = asset2_clean

            if verbose:
                print(f"  Loaded {len(df)} trading days "
                      f"({df.index[0].date()} to {df.index[-1].date()})")
                print(f"  {self.asset1_ticker}: ${df['Asset1_Close'].min():.2f} - ${df['Asset1_Close'].max():.2f}")
                print(f"  {self.asset2_ticker}: ${df['Asset2_Close'].min():.2f} - ${df['Asset2_Close'].max():.2f}")

            return df

        except Exception as e:
            # Single error type for callers; keep the root cause attached.
            raise RuntimeError(f"Failed to fetch data for {self.pair_name}: {e}") from e

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop multi-index columns, forward-fill small gaps, remove NaNs."""
        if isinstance(df.columns, pd.MultiIndex):
            df = df.copy()
            df.columns = df.columns.droplevel(1)

        df = df.replace(0, np.nan)
        df = df.ffill(limit=3)
        df = df.dropna()

        return df

    def _quality_filter(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove rows with prices below threshold or extreme z-score outliers."""
        original_len = len(df)

        df = df[
            (df['Asset1_Close'] > self.config.data.min_price_threshold) &
            (df['Asset2_Close'] > self.config.data.min_price_threshold)
        ]

        for col in ['Asset1_Close', 'Asset2_Close']:
            z_scores = np.abs(stats.zscore(df[col]))
            df = df[z_scores < self.config.data.outlier_std_threshold]

        if 'Asset1_Volume' in df.columns and 'Asset2_Volume' in df.columns:
            df = df[(df['Asset1_Volume'] > 0) & (df['Asset2_Volume'] > 0)]

        removed = original_len - len(df)
        if removed > 0:
            print(f"  Filtered {removed} low-quality rows ({removed/original_len*100:.2f}%)")

        return df

    def test_cointegration(self, verbose: bool = True) -> Dict[str, float]:
        """
        Run Engle-Granger and Johansen cointegration tests on the loaded pair.
        Returns a dict with p-value and boolean cointegration status.
        """
        if verbose:
            print(f"\nStatistical validation: {self.pair_name}")

        asset1 = self.df['Asset1_Close'].values
        asset2 = self.df['Asset2_Close'].values

        _, coint_pval, _ = coint(asset1, asset2)
        is_cointegrated = coint_pval < self.config.data.cointegration_pvalue

        if verbose:
            print(f"  Engle-Granger p-value: {coint_pval:.6f} "
                  f"({'cointegrated' if is_cointegrated else 'NOT cointegrated'})")

        results = {
            'coint_pvalue': coint_pval,
            'is_cointegrated': is_cointegrated
        }

        self.validation_results.update(results)

        johansen_results = self.test_johansen_cointegration(verbose=verbose)

        both_cointegrated = is_cointegrated and johansen_results['johansen_is_cointegrated']

        if verbose:
            print(f"  Consensus (both tests): {'cointegrated' if both_cointegrated else 'MIXED / NOT cointegrated'}")

        return results

    def test_johansen_cointegration(self, verbose: bool = True) -> dict:
        """
        Johansen cointegration test for the pair.

        Unlike Engle-Granger (which runs OLS regression), Johansen tests for
        cointegration directly in a VAR framework and handles the case where
        both variables may be endogenous. For a bivariate pair, it tests
        whether the rank of the cointegrating matrix is >= 1.

        Returns dict with trace statistic, max-eigenvalue statistic,
        critical values at 90/95/99%, and a boolean is_cointegrated.
        """
        from statsmodels.tsa.vector_ar.vecm import coint_johansen

        if self.df is None:
            raise ValueError("Data not loaded. Call fetch_data() first.")

        data = self.df[['Asset1_Close', 'Asset2_Close']].values

        # det_order=0: assume constant in cointegrating relationship (no trend)
        # k_ar_diff=1: use 1 lag difference (standard for daily price data)
        result = coint_johansen(data, det_order=0, k_ar_diff=1)

        trace_stat = result.lr1[0]
        trace_crit_90 = result.cvt[0, 0]
        trace_crit_95 = result.cvt[0, 1]
        trace_crit_99 = result.cvt[0, 2]

        max_eig_stat = result.lr2[0]
        max_eig_crit_90 = result.cvm[0, 0]
        max_eig_crit_95 = result.cvm[0, 1]
        max_eig_crit_99 = result.cvm[0, 2]

        is_cointegrated = trace_stat > trace_crit_95

        if verbose:
            print(f"  Johansen trace stat: {trace_stat:.4f} "
                  f"(crit 95%={trace_crit_95:.3f}) "
                  f"-> {'cointegrated' if is_cointegrated else 'NOT cointegrated'}")

        output = {
            'johansen_trace_stat': trace_stat,
            'johansen_trace_crit_95': trace_crit_95,
            'johansen_max_eig_stat': max_eig_stat,
            'johansen_max_eig_crit_95': max_eig_crit_95,
            'johansen_is_cointegrated': is_cointegrated,
        }

        self.validation_results.update(output)
        return output

    def compute_spread(self, method: Literal['log', 'simple', 'ratio'] = 'log',
                       hedge_ratio: Optional[float] = None) -> pd.Series:
        """
        Compute spread between Asset1 and Asset2.

        Args:
            method:
                - 'log': log(Asset1) - beta*log(Asset2) [default]
                - 'simple': Asset1 - beta*Asset2
                - 'ratio': Asset1 / Asset2
            hedge_ratio: Beta to apply. Defaults to 1.0 if None.

        Returns:
            pd.Series: Computed spread
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call fetch_data() first.")

        if method == 'log':
            log1 = np.log(self.df['Asset1_Close'])
            log2 = np.log(self.df['Asset2_Close'])
            beta = hedge_ratio if hedge_ratio is not None else 1.0
            spread = log1 - beta * log2
        elif method == 'simple':
            beta = hedge_ratio if hedge_ratio is not None else 1.0
            spread = self.df['Asset1_Close'] - beta * self.df['Asset2_Close']
        elif method == 'ratio':
            spread = self.df['Asset1_Close'] / self.df['Asset2_Close']
        else:
            raise ValueError(f"Unknown method: {method}. Use 'log', 'simple', or 'ratio'")

        return spread

    def test_stationarity(self, spread: pd.Series, verbose: bool = True) -> Dict[str, float]:
        """
        Run an Augmented Dickey-Fuller test on the spread.

        NaNs are dropped before testing and the lag length is selected by AIC.
        The spread is called stationary when the p-value is below 0.05.

        Args:
            spread: Spread series to test.
            verbose: Print statistic, p-value, critical values and verdict.

        Returns:
            Dict with ``adf_statistic``, ``adf_pvalue`` and ``is_stationary``;
            the same entries are merged into ``self.validation_results``.
        """
        adf_output = adfuller(spread.dropna(), autolag='AIC')
        adf_stat, pvalue = adf_output[0], adf_output[1]
        critical_values = adf_output[4]  # mapping keyed by '1%', '5%', '10%'
        is_stationary = pvalue < 0.05

        if verbose:
            verdict = 'stationary' if is_stationary else 'NON-stationary'
            print(f"\n  ADF stationarity test:")
            print(f"    Statistic: {adf_stat:.4f}, p-value: {pvalue:.6f}")
            print(f"    Critical values: 1%={critical_values['1%']:.3f}, "
                  f"5%={critical_values['5%']:.3f}, 10%={critical_values['10%']:.3f}")
            print(f"    Result: {verdict}")

        results = dict(
            adf_statistic=adf_stat,
            adf_pvalue=pvalue,
            is_stationary=is_stationary,
        )
        self.validation_results.update(results)
        return results

    def calculate_half_life(self, spread: pd.Series, verbose: bool = True) -> float:
        """
        Estimate mean-reversion speed (half-life) via Ornstein-Uhlenbeck regression.

        Returns:
            float: Half-life in days (lower = faster mean-reversion)
        """
        spread_lag = spread.shift(1).dropna()
        spread_diff = spread.diff().dropna()

        spread_lag, spread_diff = spread_lag.align(spread_diff, join='inner')

        # OLS regression: delta_y_t = alpha + beta * y_{t-1} + epsilon
        beta = np.polyfit(spread_lag, spread_diff, 1)[0]
        half_life = -np.log(2) / beta if beta < 0 else np.inf

        if verbose:
            print(f"\n  Half-life: {half_life:.2f} days")
            if half_life < 30:
                print(f"    Fast mean-reversion")
            elif half_life < 60:
                print(f"    Moderate mean-reversion")
            else:
                print(f"    Slow mean-reversion (patience required)")

        self.validation_results['half_life'] = half_life
        return half_life

    def calculate_hedge_ratio(self, method: Literal['ols', 'tls', 'kalman'] = 'ols') -> float:
        """
        Calculate optimal hedge ratio (beta) between assets.

        Args:
            method: 'ols' (ordinary least squares), 'tls' (total least squares),
                    or 'kalman' (time-varying via Kalman filter)

        Returns:
            float: Hedge ratio (units of Asset2 per unit of Asset1)
        """
        asset1 = self.df['Asset1_Close'].values
        asset2 = self.df['Asset2_Close'].values

        if method == 'ols':
            # Asset1 = alpha + beta*Asset2 + epsilon
            beta = np.polyfit(asset2, asset1, 1)[0]
        elif method == 'tls':
            # Total Least Squares accounts for noise in both variables
            from scipy.linalg import svd
            X = np.vstack([asset2, asset1]).T
            X_centered = X - X.mean(axis=0)
            U, s, Vt = svd(X_centered)
            beta = Vt[0, 1] / Vt[0, 0]
        elif method == 'kalman':
            # Kalman filter for time-varying hedge ratio estimation.
            # State: [alpha, beta] where Asset1 = alpha + beta * Asset2 + noise.
            # Produces a dynamically updated beta that adapts as the relationship evolves.
            asset1 = self.df['Asset1_Close'].values.astype(float)
            asset2 = self.df['Asset2_Close'].values.astype(float)
            n = len(asset1)

            # Initialize state at OLS estimate
            beta_ols = np.polyfit(asset2, asset1, 1)
            state = np.array([beta_ols[1], beta_ols[0]])  # [alpha, beta]

            P = np.eye(2) * 1.0       # state covariance
            Q = np.eye(2) * 1e-4      # process noise
            R = np.var(asset1) * 0.01  # observation noise

            betas = np.zeros(n)

            for t in range(n):
                H = np.array([[1.0, asset2[t]]])

                P_pred = P + Q

                y_pred = H @ state
                innovation = asset1[t] - y_pred[0]

                S = H @ P_pred @ H.T + R
                K = P_pred @ H.T / S[0, 0]

                state = state + K * innovation
                P = (np.eye(2) - K @ H) @ P_pred

                betas[t] = state[1]

            self.kalman_betas = pd.Series(betas, index=self.df.index)
            beta = betas[-1]

            print(f"\n  Hedge ratio (Kalman): final beta={beta:.4f}, "
                  f"initial beta={beta_ols[0]:.4f}, drift={betas[-1]-betas[0]:.4f}")
        else:
            raise ValueError(f"Unknown method: {method}")

        if method != 'kalman':
            print(f"\n  Hedge ratio ({method}): beta={beta:.4f} "
                  f"({self.asset2_ticker} per {self.asset1_ticker})")

        self.validation_results['hedge_ratio'] = beta
        return beta

    def compute_kalman_spread(self) -> pd.Series:
        """
        Compute spread using the time-varying Kalman filter hedge ratio.
        Requires calculate_hedge_ratio(method='kalman') to be called first.
        """
        if not hasattr(self, 'kalman_betas'):
            raise ValueError("Kalman betas not computed. Call calculate_hedge_ratio(method='kalman') first.")

        log1 = np.log(self.df['Asset1_Close'])
        log2 = np.log(self.df['Asset2_Close'])

        return log1 - self.kalman_betas * log2

    def calculate_rolling_cointegration(self, window: int = 252) -> pd.DataFrame:
        """
        Calculate rolling cointegration p-values to identify regime changes.

        Args:
            window: Rolling window in days (252=1yr, 126=6mo)

        Returns:
            DataFrame with columns: Coint_PValue, Is_Cointegrated
        """
        print(f"\nCalculating rolling cointegration (window={window} days)...")

        asset1 = self.df['Asset1_Close'].values
        asset2 = self.df['Asset2_Close'].values

        rolling_results = []

        for i in range(window, len(asset1)):
            asset1_window = asset1[i-window:i]
            asset2_window = asset2[i-window:i]

            try:
                _, pvalue, _ = coint(asset1_window, asset2_window)
                rolling_results.append({
                    'Date': self.df.index[i],
                    'Coint_PValue': pvalue,
                    'Is_Cointegrated': pvalue < 0.05
                })
            except Exception:
                rolling_results.append({
                    'Date': self.df.index[i],
                    'Coint_PValue': np.nan,
                    'Is_Cointegrated': False
                })

        results_df = pd.DataFrame(rolling_results).set_index('Date')

        valid_pvals = results_df['Coint_PValue'].dropna()
        pct_cointegrated = (results_df['Is_Cointegrated'].sum() / len(results_df)) * 100

        print(f"  Cointegrated: {pct_cointegrated:.1f}% of the time "
              f"(mean p-value: {valid_pvals.mean():.4f})")

        return results_df

    def prepare_strategy_data(self, spread_method: str = 'log') -> pd.DataFrame:
        """
        Run the full preparation pipeline and return a backtest-ready frame.

        Loads data if none is present, estimates an OLS hedge ratio, builds
        the spread with ``spread_method``, then runs the cointegration,
        stationarity and half-life checks and prints the summary report.

        NOTE(review): the OLS hedge ratio is fitted on raw prices, yet with
        the default 'log' method it is applied to log prices - confirm this
        is intended.

        Returns:
            Copy of ``self.df`` with added ``Spread`` and ``Hedge_Ratio``
            columns.
        """
        if self.df is None:
            self.fetch_data()

        beta = self.calculate_hedge_ratio(method='ols')
        spread = self.compute_spread(method=spread_method, hedge_ratio=beta)

        # Work on a copy so the stored frame keeps only the raw columns.
        prepared = self.df.copy()
        prepared['Spread'] = spread
        prepared['Hedge_Ratio'] = beta

        self.test_cointegration()
        self.test_stationarity(prepared['Spread'])
        self.calculate_half_life(prepared['Spread'])

        print(self.generate_summary_report())

        return prepared

    def generate_summary_report(self) -> str:
        """Return a summary string of all validation test results."""
        if not self.validation_results:
            return "No validation performed yet."

        lines = [f"\nValidation summary: {self.pair_name}"]

        if self.validation_results.get('is_cointegrated'):
            lines.append("  Engle-Granger: cointegrated")
        else:
            lines.append("  Engle-Granger: NOT cointegrated")

        if 'johansen_is_cointegrated' in self.validation_results:
            if self.validation_results['johansen_is_cointegrated']:
                lines.append("  Johansen: cointegrated")
            else:
                lines.append("  Johansen: NOT cointegrated")

        if self.validation_results.get('is_stationary'):
            lines.append("  Spread: stationary")
        else:
            lines.append("  Spread: non-stationary (may trend)")

        hl = self.validation_results.get('half_life', float('inf'))
        if hl < 60:
            lines.append(f"  Half-life: {hl:.1f} days (tradeable)")
        else:
            lines.append(f"  Half-life: {hl:.1f} days (slow mean-reversion)")

        all_good = (
            self.validation_results.get('is_cointegrated', False) and
            self.validation_results.get('is_stationary', False) and
            hl < 60
        )

        lines.append(f"  Verdict: {'good pair for mean-reversion' if all_good else 'proceed with caution'}")

        return "\n".join(lines) + "\n"

    def get_pair_info(self) -> Dict:
        """Return metadata about the current pair."""
        return {
            'pair_name': self.pair_name,
            'asset1': self.asset1_ticker,
            'asset2': self.asset2_ticker,
            'data_points': len(self.df) if self.df is not None else 0,
            'date_range': (
                f"{self.df.index[0].date()} to {self.df.index[-1].date()}"
                if self.df is not None else "Not loaded"
            ),
            'validation': self.validation_results
        }

    def validate_timezone_alignment(self, df: pd.DataFrame, verbose: bool = True) -> bool:
        """
        Check that both assets trade in compatible timezones.

        Returns:
            bool: True if timezones are aligned
        """
        if df.index.tz is None:
            if verbose:
                print("Warning: data has no timezone info, assuming same timezone for both assets")
            return True

        asset1_hours = self._get_trading_hours(self.asset1_ticker)
        asset2_hours = self._get_trading_hours(self.asset2_ticker)

        if asset1_hours != asset2_hours:
            print(f"Timezone mismatch: {self.asset1_ticker} trades {asset1_hours}, "
                  f"{self.asset2_ticker} trades {asset2_hours}")
            print("This may cause look-ahead bias in backtests")
            return False

        if verbose:
            print(f"Timezone validation passed - both trade in {asset1_hours}")
        return True

    def align_timezones(self, df: pd.DataFrame, target_tz: str = 'America/New_York') -> pd.DataFrame:
        """
        Convert all data to a common timezone.

        Args:
            df: DataFrame with potentially mixed timezones
            target_tz: Target timezone (default: US Eastern)

        Returns:
            DataFrame with aligned timestamps
        """
        if df.index.tz is None:
            df.index = df.index.tz_localize('UTC')

        df.index = df.index.tz_convert(target_tz)

        print(f"All data converted to {target_tz}")
        return df