-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMacro_Data.py
More file actions
210 lines (184 loc) · 8.17 KB
/
Macro_Data.py
File metadata and controls
210 lines (184 loc) · 8.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os.path
import pandas as pd
import requests
import pandas_datareader as pdr
from datetime import datetime
import time
# FRED API configuration.
# NOTE(security): the API key used to be hard-coded only; prefer the
# FRED_API_KEY environment variable, keeping the original literal as a
# backward-compatible fallback so existing setups continue to work.
fred_api_key = os.environ.get('FRED_API_KEY', 'eaf273aa130ba994bee4b505b4e078a8')
# FRED observations endpoint (per-series data points).
fred_endpoint = 'https://api.stlouisfed.org/fred/series/observations'
# Set date range to maximum available
end_date = datetime.now()
# Most data sources don't have reliable data before 1970
start_date = datetime(1970, 1, 1)
# Function to get data from FRED
def get_fred_data(series_id, series_name=None):
    """Fetch all available observations for a FRED series.

    Args:
        series_id: FRED series identifier (e.g. 'UNRATE').
        series_name: Optional column name for the result; defaults to
            series_id when not given.

    Returns:
        DataFrame indexed by date with one numeric column, or an empty
        DataFrame on any error (best-effort: errors are printed, not raised).
    """
    try:
        params = {
            'series_id': series_id,
            'api_key': fred_api_key,
            'file_type': 'json',
            # No start date specified to get all available data
            'observation_end': end_date.strftime('%Y-%m-%d')
        }
        # timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status surfaces HTTP errors (e.g. bad API
        # key) here instead of as a confusing KeyError on 'observations'.
        response = requests.get(fred_endpoint, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        # Convert to DataFrame
        df = pd.DataFrame(data['observations'])
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)
        column_name = series_name if series_name else series_id
        # FRED marks missing values with '.'; errors='coerce' maps them to NaN.
        df[column_name] = pd.to_numeric(df['value'], errors='coerce')
        print(f"Successfully retrieved {column_name} data from FRED - {len(df)} records")
        return df[[column_name]]
    except Exception as e:
        print(f"Error fetching {series_id} from FRED: {e}")
        return pd.DataFrame()
# Function to get data using pandas_datareader
def get_yahoo_data(symbol, data_source='yahoo', symbol_name=None):
    """Fetch a price series for one symbol, with layered fallbacks.

    Order of attempts: the requested data_source first; on failure, Stooq
    (for a small set of well-known indices); finally 'av-daily' and 'naver'.

    Args:
        symbol: Ticker symbol as understood by pandas_datareader.
        data_source: pandas_datareader source name (default 'yahoo').
        symbol_name: Optional column name for the result; defaults to symbol.

    Returns:
        DataFrame with a single column (the Close price when available,
        otherwise the first column), or an empty DataFrame if every
        source fails. Errors are printed, never raised.
    """
    try:
        print(f"Fetching data for {symbol}...")
        # We'll try with a very early start date to get maximum history
        data = pdr.DataReader(
            symbol,
            data_source,
            start=start_date,
            end=end_date
        )
        column_name = symbol_name if symbol_name else symbol
        # Prefer the Close price; fall back to the first column for
        # sources that use a different layout.
        data[column_name] = data['Close'] if 'Close' in data.columns else data.iloc[:, 0]
        print(f"Successfully retrieved {column_name} data - {len(data)} records")
        return data[[column_name]]
    except Exception as e:
        print(f"Error fetching {symbol}: {e}")
        # If Yahoo fails, try Stooq as a backup for some common indices
        if data_source == 'yahoo' and symbol in ['^VIX', '^GSPC', '^DJI', '^IXIC', '^TNX']:
            print(f"Trying alternative source (stooq) for {symbol}...")
            try:
                # Map Yahoo symbols to Stooq symbols
                stooq_map = {
                    '^VIX': '^VIX',
                    '^GSPC': '^SPX',
                    '^DJI': '^DJI',
                    '^IXIC': '^NDQ',
                    '^TNX': '^TNX'
                }
                stooq_symbol = stooq_map.get(symbol, symbol)
                data = pdr.DataReader(stooq_symbol, 'stooq', start=start_date, end=end_date)
                # Only accept the Stooq result if it has a Close column;
                # otherwise fall through to the remaining sources below.
                if 'Close' in data.columns:
                    column_name = symbol_name if symbol_name else symbol
                    data[column_name] = data['Close']
                    print(f"Successfully retrieved {column_name} data from Stooq - {len(data)} records")
                    return data[[column_name]]
            except Exception as e2:
                print(f"Alternative source also failed: {e2}")
        # Try other alternative data sources
        # NOTE(review): 'av-daily' normally needs an Alpha Vantage API key and
        # 'naver' covers Korean listings — these are last-resort attempts and
        # will usually fail for the symbols above; verify if they ever succeed.
        try_sources = ['av-daily', 'naver']  # Additional sources to try
        for alt_source in try_sources:
            try:
                print(f"Trying {alt_source} for {symbol}...")
                data = pdr.DataReader(symbol, alt_source, start=start_date, end=end_date)
                column_name = symbol_name if symbol_name else symbol
                if 'Close' in data.columns:
                    data[column_name] = data['Close']
                else:
                    data[column_name] = data.iloc[:, 0]
                print(f"Successfully retrieved {column_name} data from {alt_source} - {len(data)} records")
                return data[[column_name]]
            except Exception as alt_e:
                print(f"{alt_source} failed for {symbol}: {alt_e}")
        # Every source failed: signal "no data" with an empty frame.
        return pd.DataFrame()
# Dictionary of tickers to fetch with proper names
# (Yahoo Finance symbol -> friendly column name used in the merged output)
tickers = {
    '^VIX': 'VIX',          # VIX Volatility Index
    'DX-Y.NYB': 'USDX',     # US Dollar Index
    '^TNX': 'TNX',          # 10-Year Treasury Yield
    'CL=F': 'CrudeOil',     # Crude Oil Futures
    'GC=F': 'Gold',         # Gold Futures
    '^GSPC': 'SP500',       # S&P 500
    '^DJI': 'DOW',          # Dow Jones Industrial Average
    '^IXIC': 'NASDAQ',      # NASDAQ Composite
    '^RUT': 'RUSSELL',      # Russell 2000
    'EURUSD=X': 'EURUSD',   # EUR/USD Exchange Rate
    'ZB=F': 'T10Y',         # 10-Year Treasury Bond Futures
    'ZT=F': 'T2Y',          # 2-Year Treasury Note Futures
    '^HSI': 'HANGSENG'      # Hang Seng Index
}
# Add alternative symbols to try if the primary ones fail
# (mostly ETF proxies for the indices/futures above; matched against the
# primary name by substring in the fetch loop below)
alternative_tickers = {
    'VIX': 'VIX',           # Direct VIX without ^ prefix
    'SPY': 'SP500ETF',      # S&P 500 ETF as alternative to ^GSPC
    'DIA': 'DOWETF',        # Dow Jones ETF as alternative to ^DJI
    'QQQ': 'NASDAQETF',     # NASDAQ ETF as alternative to ^IXIC
    'IWM': 'RUSSELLETF',    # Russell 2000 ETF as alternative to ^RUT
    'USO': 'OilETF',        # Oil ETF as alternative to CL=F
    'GLD': 'GoldETF',       # Gold ETF as alternative to GC=F
    'TLT': 'T10YETF',       # Long-term Treasury ETF
    'SHY': 'T2YETF',        # Short-term Treasury ETF
    'UUP': 'USDollarETF',   # US Dollar ETF
    'EUO': 'EuroETF'        # Euro ETF
}
# FRED series to fetch
# (FRED series ID -> friendly column name used in the merged output)
fred_series = {
    'EFFR': 'FedRate',                      # Effective Federal Funds Rate
    'UNRATE': 'Unemployment',               # Unemployment Rate
    'UMCSENT': 'ConsumerSentiment',         # Consumer Sentiment
    'T10Y2Y': 'YieldCurve',                 # 10-Year Treasury Constant Maturity Minus 2-Year
    'CPIAUCSL': 'Inflation',                # Consumer Price Index for All Urban Consumers
    'INDPRO': 'IndustrialProduction',       # Industrial Production Index
    'RSAFS': 'RetailSales',                 # Retail Sales
    'PCE': 'PersonalConsumptionExpenditure',  # Personal Consumption Expenditures
    'FEDFUNDS': 'FederalFundsRate'          # Federal Funds Effective Rate
}
# --- Driver: fetch every series, merge, fill gaps, and save to CSV. ---
# Try to get data
dfs = []
# Get Yahoo/Stooq data
print("Fetching market data...")
for symbol, name in tickers.items():
    df = get_yahoo_data(symbol, symbol_name=name)
    if not df.empty:
        dfs.append(df)
    else:
        # Try alternative if primary symbol fails: pick an ETF/alias whose
        # friendly name matches the primary name by prefix or substring.
        print(f"Primary ticker {symbol} failed, trying alternatives...")
        for alt_symbol, alt_name in alternative_tickers.items():
            if alt_name.lower().startswith(name.lower()) or name.lower() in alt_name.lower():
                print(f"Trying alternative {alt_symbol} for {symbol}...")
                df = get_yahoo_data(alt_symbol, symbol_name=name)
                if not df.empty:
                    dfs.append(df)
                    break
    time.sleep(1)  # Add delay to avoid hitting rate limits
# Get FRED data
print("\nFetching economic data from FRED...")
for series_id, name in fred_series.items():
    df = get_fred_data(series_id, series_name=name)
    if not df.empty:
        dfs.append(df)
    time.sleep(0.5)  # Add smaller delay for FRED
# Merge all dataframes
if dfs:
    print("\nMerging all datasets...")
    # Outer-join on the date index; each input frame has one column.
    merged_data = pd.concat(dfs, axis=1)
    # Fill missing values: forward fill first (carry last observation),
    # then backward fill to cover leading gaps at the start of the range.
    # fillna(method=...) is deprecated since pandas 2.1; use ffill()/bfill().
    print("Handling missing values...")
    merged_data = merged_data.ffill().bfill()
    # Save to CSV file
    print("Saving to CSV...")
    merged_data.to_csv('merged_data.csv', index_label='date')
    print("\nData successfully saved to 'merged_data.csv'")
    # Print summary
    print("\nData Summary:")
    print(f"Date Range: {merged_data.index.min()} to {merged_data.index.max()}")
    print(f"Number of rows: {len(merged_data)}")
    print(f"Available columns: {', '.join(merged_data.columns)}")
    print("\nSample data (first 5 rows):")
    print(merged_data.head())
    print("\nSample data (last 5 rows):")
    print(merged_data.tail())
else:
    print("No data was successfully retrieved.")