diff --git a/README.md b/README.md index 94728b8ee..2e2fb0f88 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,10 @@ msft.info # get historical market data hist = msft.history(period="max") -# show actions (dividends, splits) +# show meta information about the history (requires history() to be called first) +msft.history_metadata + +# show actions (dividends, splits, capital gains) msft.actions # show dividends @@ -68,20 +71,24 @@ msft.dividends # show splits msft.splits + +# show capital gains (for mutual funds & etfs) +msft.capital_gains + # show share count msft.shares -# show income statement +# show financials: +# - income statement msft.income_stmt msft.quarterly_income_stmt - -# show balance sheet +# - balance sheet msft.balance_sheet msft.quarterly_balance_sheet - -# show cash flow statement +# - cash flow statement msft.cashflow msft.quarterly_cashflow +# see `Ticker.get_income_stmt()` for more options # show major holders msft.major_holders @@ -104,14 +111,15 @@ msft.recommendations msft.recommendations_summary # show analysts other work msft.analyst_price_target -mfst.revenue_forecasts -mfst.earnings_forecasts -mfst.earnings_trend +msft.revenue_forecasts +msft.earnings_forecasts +msft.earnings_trend # show next event (earnings, etc) msft.calendar -# show all earnings dates +# Show future and historic earnings dates; by default returns at most the next 4 quarters and the last 8 quarters. +# Note: If more are needed, use msft.get_earnings_dates(limit=XX) with an increased limit argument. msft.earnings_dates # show ISIN code - *experimental* @@ -140,6 +148,7 @@ msft.history(..., proxy="PROXY_SERVER") msft.get_actions(proxy="PROXY_SERVER") msft.get_dividends(proxy="PROXY_SERVER") msft.get_splits(proxy="PROXY_SERVER") +msft.get_capital_gains(proxy="PROXY_SERVER") msft.get_balance_sheet(proxy="PROXY_SERVER") msft.get_cashflow(proxy="PROXY_SERVER") msft.option_chain(..., proxy="PROXY_SERVER") @@ -154,7 +163,7 @@ the Ticker constructor. import requests_cache session = requests_cache.CachedSession('yfinance.cache') session.headers['User-agent'] = 'my-program/1.0' -ticker = yf.Ticker('msft aapl goog', session=session) +ticker = yf.Ticker('msft', session=session) # The scraped response will be stored in the cache ticker.actions ``` @@ -165,7 +174,6 @@ To initialize multiple `Ticker` objects, use import yfinance as yf tickers = yf.Tickers('msft aapl goog') -# ^ returns a named tuple of Ticker objects # access each ticker using (example) tickers.tickers['MSFT'].info @@ -195,7 +203,7 @@ data = yf.download( # or pdr.get_data_yahoo(... # fetch data by interval (including intraday if period < 60 days) # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo # (optional, default is '1d') - interval = "1m", + interval = "5d", # Whether to ignore timezone when aligning ticker data from # different timezones. Default is True. 
False may be useful for @@ -289,12 +297,15 @@ To install `yfinance` using `conda`, see ### Requirements - [Python](https://www.python.org) \>= 2.7, 3.4+ -- [Pandas](https://github.com/pydata/pandas) (tested to work with - \>=0.23.1) -- [Numpy](http://www.numpy.org) \>= 1.11.1 -- [requests](http://docs.python-requests.org/en/master/) \>= 2.14.2 -- [lxml](https://pypi.org/project/lxml/) \>= 4.5.1 -- [appdirs](https://pypi.org/project/appdirs) \>=1.4.4 +- [Pandas](https://github.com/pydata/pandas) \>= 1.3.0 +- [Numpy](http://www.numpy.org) \>= 1.16.5 +- [requests](http://docs.python-requests.org/en/master) \>= 2.26 +- [lxml](https://pypi.org/project/lxml) \>= 4.9.1 +- [appdirs](https://pypi.org/project/appdirs) \>= 1.4.4 +- [pytz](https://pypi.org/project/pytz) \>=2022.5 +- [frozendict](https://pypi.org/project/frozendict) \>= 2.3.4 +- [beautifulsoup4](https://pypi.org/project/beautifulsoup4) \>= 4.11.1 +- [html5lib](https://pypi.org/project/html5lib) \>= 1.1 ### Optional (if you want to use `pandas_datareader`) diff --git a/meta.yaml b/meta.yaml index 9d97dcfc2..6d3345fdf 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,5 +1,5 @@ {% set name = "yfinance" %} -{% set version = "0.1.58" %} +{% set version = "0.2.0" %} package: name: "{{ name|lower }}" @@ -16,22 +16,30 @@ build: requirements: host: - - pandas >=0.24.0 + - pandas >=1.3.0 - numpy >=1.16.5 - - requests >=2.21 + - requests >=2.26 - multitasking >=0.0.7 - - lxml >=4.5.1 - - appdirs >= 1.4.4 + - lxml >=4.9.1 + - appdirs >=1.4.4 + - pytz >=2022.5 + - frozendict >=2.3.4 + - beautifulsoup4 >=4.11.1 + - html5lib >=1.1 - pip - python run: - - pandas >=0.24.0 + - pandas >=1.3.0 - numpy >=1.16.5 - - requests >=2.21 + - requests >=2.26 - multitasking >=0.0.7 - - lxml >=4.5.1 - - appdirs >= 1.4.4 + - lxml >=4.9.1 + - appdirs >=1.4.4 + - pytz >=2022.5 + - frozendict >=2.3.4 + - beautifulsoup4 >=4.11.1 + - html5lib >=1.1 - python test: diff --git a/requirements.txt b/requirements.txt index 28964912b..5f467b322 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -pandas>=0.24.0 +pandas>=1.3.0 numpy>=1.16.5 requests>=2.26 multitasking>=0.0.7 -lxml>=4.5.1 +lxml>=4.9.1 appdirs>=1.4.4 pytz>=2022.5 frozendict>=2.3.4 beautifulsoup4>=4.11.1 -html5lib>=1.1 \ No newline at end of file +html5lib>=1.1 diff --git a/setup.py b/setup.py index fac4f1123..3261b09ef 100644 --- a/setup.py +++ b/setup.py @@ -59,9 +59,9 @@ platforms=['any'], keywords='pandas, yahoo finance, pandas datareader', packages=find_packages(exclude=['contrib', 'docs', 'tests', 'examples']), - install_requires=['pandas>=1.1.0', 'numpy>=1.15', + install_requires=['pandas>=1.3.0', 'numpy>=1.16.5', 'requests>=2.26', 'multitasking>=0.0.7', - 'lxml>=4.5.1', 'appdirs>=1.4.4', 'pytz>=2022.5', + 'lxml>=4.9.1', 'appdirs>=1.4.4', 'pytz>=2022.5', 'frozendict>=2.3.4', 'beautifulsoup4>=4.11.1', 'html5lib>=1.1'], entry_points={ diff --git a/tests/prices.py b/tests/prices.py index a0601a652..e0d722578 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -6,22 +6,21 @@ import pytz as _tz import numpy as _np import pandas as _pd -import os -# Create temp session -import requests_cache, tempfile - -td = tempfile.TemporaryDirectory() +import requests_cache class TestPriceHistory(unittest.TestCase): - def setUp(self): - global td - self.td = td - self.session = requests_cache.CachedSession(os.path.join(self.td.name, "yfinance.cache")) + session = None + + @classmethod + def setUpClass(cls): + cls.session = requests_cache.CachedSession(backend='memory') - def tearDown(self): - 
self.session.close() + @classmethod + def tearDownClass(cls): + if cls.session is not None: + cls.session.close() def test_daily_index(self): tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"] @@ -37,6 +36,26 @@ def test_daily_index(self): f = df.index.time == _dt.time(0) self.assertTrue(f.all()) + def test_duplicatingHourly(self): + tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"] + for tkr in tkrs: + dat = yf.Ticker(tkr, session=self.session) + tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None) + + dt_utc = _tz.timezone("UTC").localize(_dt.datetime.utcnow()) + dt = dt_utc.astimezone(_tz.timezone(tz)) + + df = dat.history(start=dt.date() - _dt.timedelta(days=1), interval="1h") + + dt0 = df.index[-2] + dt1 = df.index[-1] + try: + self.assertNotEqual(dt0.hour, dt1.hour) + except: + print("Ticker = ", tkr) + raise + + def test_duplicatingDaily(self): tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"] test_run = False @@ -251,11 +270,7 @@ def test_weekly_2rows_fix(self): df = dat.history(start=start, interval="1wk") self.assertTrue((df.index.weekday == 0).all()) - def test_repair_weekly_100x(self): - # Sometimes, Yahoo returns prices 100x the correct value. - # Suspect mixup between £/pence or $/cents etc. - # E.g. ticker PNL.L - + def test_repair_100x_weekly(self): # Setup: tkr = "PNL.L" dat = yf.Ticker(tkr, session=self.session) @@ -272,6 +287,7 @@ def test_repair_weekly_100x(self): _dt.date(2022, 10, 16), _dt.date(2022, 10, 9), _dt.date(2022, 10, 2)])) + df = df.sort_index() df.index.name = "Date" df_bad = df.copy() df_bad.loc["2022-10-23", "Close"] *= 100 @@ -286,7 +302,13 @@ def test_repair_weekly_100x(self): # First test - no errors left for c in data_cols: - self.assertTrue(_np.isclose(df_repaired[c], df[c], rtol=1e-2).all()) + try: + self.assertTrue(_np.isclose(df_repaired[c], df[c], rtol=1e-2).all()) + except: + print(df[c]) + print(df_repaired[c]) + raise + # Second test - all differences should be either ~1x or ~100x ratio = df_bad[data_cols].values / df[data_cols].values @@ -299,11 +321,7 @@ def test_repair_weekly_100x(self): f_1 = ratio == 1 self.assertTrue((f_100 | f_1).all()) - def test_repair_weekly_preSplit_100x(self): - # Sometimes, Yahoo returns prices 100x the correct value. - # Suspect mixup between £/pence or $/cents etc. - # E.g. ticker PNL.L - + def test_repair_100x_weekly_preSplit(self): # PNL.L has a stock-split in 2022. Sometimes requesting data before 2022 is not split-adjusted. tkr = "PNL.L" @@ -321,6 +339,7 @@ def test_repair_weekly_preSplit_100x(self): _dt.date(2020, 3, 23), _dt.date(2020, 3, 16), _dt.date(2020, 3, 9)])) + df = df.sort_index() # Simulate data missing split-adjustment: df[data_cols] *= 100.0 df["Volume"] *= 0.01 @@ -359,11 +378,7 @@ def test_repair_weekly_preSplit_100x(self): f_1 = ratio == 1 self.assertTrue((f_100 | f_1).all()) - def test_repair_daily_100x(self): - # Sometimes, Yahoo returns prices 100x the correct value. - # Suspect mixup between £/pence or $/cents etc. - # E.g. 
ticker PNL.L - + def test_repair_100x_daily(self): tkr = "PNL.L" dat = yf.Ticker(tkr, session=self.session) tz_exchange = dat.info["exchangeTimezoneName"] @@ -379,6 +394,7 @@ _dt.date(2022, 10, 31), _dt.date(2022, 10, 28), _dt.date(2022, 10, 27)])) + df = df.sort_index() df.index.name = "Date" df_bad = df.copy() df_bad.loc["2022-11-01", "Close"] *= 100 @@ -404,10 +420,7 @@ f_1 = ratio == 1 self.assertTrue((f_100 | f_1).all()) - def test_repair_daily_zeroes(self): - # Sometimes Yahoo returns price=0.0 when price obviously not zero - # E.g. ticker BBIL.L - + def test_repair_zeroes_daily(self): tkr = "BBIL.L" dat = yf.Ticker(tkr, session=self.session) tz_exchange = dat.info["exchangeTimezoneName"] @@ -421,23 +434,61 @@ index=_pd.to_datetime([_dt.datetime(2022, 11, 1), _dt.datetime(2022, 10, 31), _dt.datetime(2022, 10, 30)])) + df_bad = df_bad.sort_index() df_bad.index.name = "Date" df_bad.index = df_bad.index.tz_localize(tz_exchange) - repaired_df = dat._fix_zero_prices(df_bad, "1d", tz_exchange) + repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange) correct_df = df_bad.copy() - correct_df.loc[correct_df.index[0], "Open"] = 102.080002 - correct_df.loc[correct_df.index[0], "Low"] = 102.032501 - correct_df.loc[correct_df.index[0], "High"] = 102.080002 + correct_df.loc["2022-11-01", "Open"] = 102.080002 + correct_df.loc["2022-11-01", "Low"] = 102.032501 + correct_df.loc["2022-11-01", "High"] = 102.080002 for c in ["Open", "Low", "High", "Close"]: self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-8).all()) -try: - if __name__ == '__main__': - unittest.main() -finally: - td.cleanup() + def test_repair_zeroes_hourly(self): + tkr = "INTC" + dat = yf.Ticker(tkr, session=self.session) + tz_exchange = dat.info["exchangeTimezoneName"] + + df_bad = _pd.DataFrame(data={"Open": [29.68, 29.49, 29.545, _np.nan, 29.485], + "High": [29.68, 29.625, 29.58, _np.nan, 29.49], + "Low": [29.46, 29.4, 29.45, _np.nan, 29.31], + "Close": [29.485, 29.545, 29.485, _np.nan, 29.325], + "Adj Close": [29.485, 29.545, 29.485, _np.nan, 29.325], + "Volume": [3258528, 2140195, 1621010, 0, 0]}, + index=_pd.to_datetime([_dt.datetime(2022,11,25, 9,30), + _dt.datetime(2022,11,25, 10,30), + _dt.datetime(2022,11,25, 11,30), + _dt.datetime(2022,11,25, 12,30), + _dt.datetime(2022,11,25, 13,00)])) + df_bad = df_bad.sort_index() + df_bad.index.name = "Date" + df_bad.index = df_bad.index.tz_localize(tz_exchange) + + repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange) + + correct_df = df_bad.copy() + idx = _pd.Timestamp(2022,11,25, 12,30).tz_localize(tz_exchange) + correct_df.loc[idx, "Open"] = 29.485001 + correct_df.loc[idx, "High"] = 29.49 + correct_df.loc[idx, "Low"] = 29.43 + correct_df.loc[idx, "Close"] = 29.455 + correct_df.loc[idx, "Adj Close"] = 29.455 + correct_df.loc[idx, "Volume"] = 609164 + for c in ["Open", "Low", "High", "Close"]: + try: + self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-7).all()) + except: + print("COLUMN", c) + print(repaired_df) + print(correct_df[c]) + print(repaired_df[c] - correct_df[c]) + raise + +if __name__ == '__main__': + unittest.main() # # Run tests sequentially: # import inspect # test_src = inspect.getsource(TestPriceHistory) # unittest.TestLoader.sortTestMethodsUsing = lambda _, x, y: ( # test_src.index(f"def {x}") - test_src.index(f"def {y}") # ) # unittest.main(verbosity=2) -
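
The repair tests above exercise the renamed `_fix_zeroes` and the 100x unit-mixup repair end to end. As a minimal usage sketch of the public API they sit behind (the ticker and period here are illustrative; per the `history()` signature in this diff, `repair` defaults to False):

```python
import yfinance as yf

dat = yf.Ticker("PNL.L")
# repair=True opts in to the zero-price and 100x unit-mixup fixes
# exercised by the tests above; it is off by default.
df = dat.history(period="1y", repair=True)
```
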
diff --git a/tests/ticker.py b/tests/ticker.py index 9f9a6207a..d75b1c6fe 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -16,7 +16,7 @@ import requests_cache # Set this to see the exact requests that are made during tests -DEBUG_LOG_REQUESTS = True +DEBUG_LOG_REQUESTS = False if DEBUG_LOG_REQUESTS: import logging @@ -29,7 +29,7 @@ class TestTicker(unittest.TestCase): @classmethod def setUpClass(cls): - cls.session = requests_cache.CachedSession() + cls.session = requests_cache.CachedSession(backend='memory') @classmethod def tearDownClass(cls): @@ -86,22 +86,102 @@ def test_badTicker(self): dat.earnings_dates dat.earnings_forecasts + def test_goodTicker(self): + # Check that yfinance works when the full API is called on the same Ticker instance -class TestTickerEarnings(unittest.TestCase): + tkr = "IBM" + dat = yf.Ticker(tkr, session=self.session) + + dat.isin + dat.major_holders + dat.institutional_holders + dat.mutualfund_holders + dat.dividends + dat.splits + dat.actions + dat.shares + dat.info + dat.calendar + dat.recommendations + dat.earnings + dat.quarterly_earnings + dat.income_stmt + dat.quarterly_income_stmt + dat.balance_sheet + dat.quarterly_balance_sheet + dat.cashflow + dat.quarterly_cashflow + dat.recommendations_summary + dat.analyst_price_target + dat.revenue_forecasts + dat.sustainability + dat.options + dat.news + dat.earnings_trend + dat.earnings_dates + dat.earnings_forecasts + + dat.history(period="1wk") + dat.history(start="2022-01-01") + dat.history(start="2022-01-01", end="2022-03-01") + yf.download([tkr], period="1wk") + +class TestTickerHistory(unittest.TestCase): def setUp(self): - self.ticker = yf.Ticker("GOOGL") + # use a ticker that has dividends + self.ticker = yf.Ticker("IBM") def tearDown(self): self.ticker = None - def test_earnings_history(self): - data = self.ticker.earnings_history + def test_history(self): + with self.assertRaises(RuntimeError): + self.ticker.history_metadata + data = self.ticker.history("1y") + self.assertIn("IBM", self.ticker.history_metadata.values(), "metadata missing") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + + def test_no_expensive_calls_introduced(self): + """ + Make sure that calling history() to get price data does not make more calls to Yahoo than absolutely necessary. + Scraping calls to anything other than "query2.finance.yahoo.com/v8/finance/chart" + will quickly trigger a spam-block when bulk-downloading history data. + """ + session = requests_cache.CachedSession(backend='memory') + ticker = yf.Ticker("GOOGL", session=session) + ticker.history("1y") + actual_urls_called = tuple([r.url for r in session.cache.filter()]) + session.close() + expected_urls = ( + 'https://query2.finance.yahoo.com/v8/finance/chart/GOOGL?range=1y&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains', + ) + self.assertEqual(expected_urls, actual_urls_called, "Different than expected url used to fetch history.")
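
`test_history` above pins down the call-order contract also documented in the README hunk: `history_metadata` raises RuntimeError until `history()` has been called. A minimal sketch of the intended usage (the ticker and period are illustrative):

```python
import yfinance as yf

msft = yf.Ticker("MSFT")
msft.history(period="1mo")         # must be called first
meta = msft.history_metadata       # now populated from the chart response
print(meta.get("instrumentType"))  # e.g. metadata key used later by history()
```
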
+ """ + session = requests_cache.CachedSession(backend='memory') + ticker = yf.Ticker("GOOGL", session=session) + ticker.history("1y") + actual_urls_called = tuple([r.url for r in session.cache.filter()]) + session.close() + expected_urls = ( + 'https://query2.finance.yahoo.com/v8/finance/chart/GOOGL?range=1y&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains', + ) + self.assertEqual(expected_urls, actual_urls_called, "Different than expected url used to fetch history.") + + def test_dividends(self): + data = self.ticker.dividends + self.assertIsInstance(data, pd.Series, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + + def test_splits(self): + data = self.ticker.splits + self.assertIsInstance(data, pd.Series, "data has wrong type") + # self.assertFalse(data.empty, "data is empty") + + def test_actions(self): + data = self.ticker.actions self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - data_cached = self.ticker.earnings_history - self.assertIs(data, data_cached, "data not cached") + +class TestTickerEarnings(unittest.TestCase): + + def setUp(self): + self.ticker = yf.Ticker("GOOGL") + + def tearDown(self): + self.ticker = None def test_earnings(self): data = self.ticker.earnings @@ -143,6 +223,18 @@ def test_earnings_trend(self): data_cached = self.ticker.earnings_trend self.assertIs(data, data_cached, "data not cached") + def test_earnings_dates_with_limit(self): + # use ticker with lots of historic earnings + ticker = yf.Ticker("IBM") + limit = 110 + data = ticker.get_earnings_dates(limit=limit) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertEqual(len(data), limit, "Wrong number or rows") + + data_cached = ticker.get_earnings_dates(limit=limit) + self.assertIs(data, data_cached, "data not cached") + class TestTickerHolders(unittest.TestCase): @@ -178,53 +270,233 @@ def test_mutualfund_holders(self): class TestTickerMiscFinancials(unittest.TestCase): + session = None + + @classmethod + def setUpClass(cls): + cls.session = requests_cache.CachedSession(backend='memory') + + @classmethod + def tearDownClass(cls): + if cls.session is not None: + cls.session.close() def setUp(self): - self.ticker = yf.Ticker("GOOGL") + self.ticker = yf.Ticker("GOOGL", session=self.session) + + # For ticker 'BSE.AX' (and others), Yahoo not returning + # full quarterly financials (usually cash-flow) with all entries, + # instead returns a smaller version in different data store. 
+ self.ticker_old_fmt = yf.Ticker("BSE.AX", session=self.session) def tearDown(self): self.ticker = None - def test_balance_sheet(self): - expected_row = "TotalAssets" - data = self.ticker.balance_sheet + def test_income_statement(self): + expected_keys = ["Total Revenue", "Basic EPS"] + expected_periods_days = 365 + + # Test contents of table + data = self.ticker.get_income_stmt(pretty=True) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning annual financials") + + # Test property defaults + data2 = self.ticker.income_stmt + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_income_stmt(pretty=False) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + + def test_quarterly_income_statement(self): + expected_keys = ["Total Revenue", "Basic EPS"] + expected_periods_days = 365//4 + + # Test contents of table + data = self.ticker.get_income_stmt(pretty=True, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning quarterly financials") + + # Test property defaults + data2 = self.ticker.quarterly_income_stmt + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_income_stmt(pretty=False, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + def test_quarterly_income_statement_old_fmt(self): + expected_row = "TotalRevenue" + data = self.ticker_old_fmt.quarterly_income_stmt self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") self.assertIn(expected_row, data.index, "Did not find expected row in index") - data_cached = self.ticker.balance_sheet + data_cached = self.ticker_old_fmt.quarterly_income_stmt self.assertIs(data, data_cached, "data not cached") - def test_quarterly_balance_sheet(self): + def test_balance_sheet(self): + expected_keys = ["Total Assets", "Net PPE"] + expected_periods_days = 365 + + # Test contents of table + data = self.ticker.get_balance_sheet(pretty=True) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = 
abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning annual financials") + + # Test property defaults + data2 = self.ticker.balance_sheet + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_balance_sheet(pretty=False) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + def test_quarterly_balance_sheet_old_fmt(self): expected_row = "TotalAssets" - data = self.ticker.quarterly_balance_sheet + data = self.ticker_old_fmt.quarterly_balance_sheet self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") self.assertIn(expected_row, data.index, "Did not find expected row in index") - data_cached = self.ticker.quarterly_balance_sheet + data_cached = self.ticker_old_fmt.quarterly_balance_sheet self.assertIs(data, data_cached, "data not cached") - def test_cashflow(self): - expected_row = "OperatingCashFlow" - data = self.ticker.cashflow + def test_quarterly_balance_sheet(self): + expected_keys = ["Total Assets", "Net PPE"] + expected_periods_days = 365//4 + + # Test contents of table + data = self.ticker.get_balance_sheet(pretty=True, freq="quarterly") self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning quarterly financials") + + # Test property defaults + data2 = self.ticker.quarterly_balance_sheet + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_balance_sheet(pretty=False, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") - data_cached = self.ticker.cashflow - self.assertIs(data, data_cached, "data not cached") + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + def test_cash_flow(self): + expected_keys = ["Operating Cash Flow", "Net PPE Purchase And Sale"] + expected_periods_days = 365 + + # Test contents of table + data = self.ticker.get_cashflow(pretty=True) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning annual financials") + + # Test property defaults + data2 = self.ticker.cashflow + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') 
for k in expected_keys] + data = self.ticker.get_cashflow(pretty=False) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") - def test_quarterly_cashflow(self): - expected_row = "OperatingCashFlow" - data = self.ticker.quarterly_cashflow + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + def test_quarterly_cashflow_old_fmt(self): + expected_row = "NetIncome" + data = self.ticker_old_fmt.quarterly_cashflow self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") self.assertIn(expected_row, data.index, "Did not find expected row in index") - data_cached = self.ticker.quarterly_cashflow + data_cached = self.ticker_old_fmt.quarterly_cashflow self.assertIs(data, data_cached, "data not cached") + def test_quarterly_cash_flow(self): + expected_keys = ["Operating Cash Flow", "Net PPE Purchase And Sale"] + expected_periods_days = 365//4 + + # Test contents of table + data = self.ticker.get_cashflow(pretty=True, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning quarterly financials") + + # Test property defaults + data2 = self.ticker.quarterly_cashflow + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_cashflow(pretty=False, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + def test_sustainability(self): data = self.ticker.sustainability self.assertIsInstance(data, pd.DataFrame, "data has wrong type") @@ -286,12 +558,27 @@ def test_options(self): self.assertIsInstance(data, tuple, "data has wrong type") self.assertTrue(len(data) > 1, "data is empty") + def test_shares(self): + data = self.ticker.shares + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + + def test_info(self): + data = self.ticker.info + self.assertIsInstance(data, dict, "data has wrong type") + self.assertIn("symbol", data.keys(), "Did not find expected key in info dict") + self.assertEqual("GOOGL", data["symbol"], "Wrong symbol value in info dict") + + def test_bad_freq_value_raises_exception(self): + self.assertRaises(ValueError, lambda: self.ticker.get_cashflow(freq="badarg")) + def suite(): suite = unittest.TestSuite() suite.addTest(TestTicker('Test ticker')) suite.addTest(TestTickerEarnings('Test earnings')) suite.addTest(TestTickerHolders('Test holders')) + suite.addTest(TestTickerHistory('Test Ticker history')) suite.addTest(TestTickerMiscFinancials('Test misc financials')) return suite diff --git a/yfinance/base.py b/yfinance/base.py index 3a525edc3..a5070fb88 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -23,8 +23,11 @@ 
import time as _time import datetime as _datetime +from typing import Optional + import pandas as _pd import numpy as _np +import pandas as pd from .data import TickerData @@ -32,9 +35,11 @@ from . import utils -import json as _json - from . import shared +from .scrapers.analysis import Analysis +from .scrapers.fundamentals import Fundamentals +from .scrapers.holders import Holders +from .scrapers.quote import Quote _BASE_URL_ = 'https://query2.finance.yahoo.com' _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' @@ -46,31 +51,16 @@ def __init__(self, ticker, session=None): self.ticker = ticker.upper() self.session = session self._history = None + self._history_metadata = None self._base_url = _BASE_URL_ self._scrape_url = _SCRAPE_URL_ self._tz = None - self._fundamentals = False - self._info = None - self._earnings_trend = None - self._sustainability = None - self._recommendations = None - self._analyst_trend_details = None - self._analyst_price_target = None - self._rev_est = None - self._eps_est = None - - self._major_holders = None - self._institutional_holders = None - self._mutualfund_holders = None self._isin = None self._news = [] self._shares = None - self._calendar = None - self._expirations = {} - self._earnings_dates = None - self._earnings_history = None + self._earnings_dates = {} self._earnings = None self._financials = None @@ -79,24 +69,25 @@ def __init__(self, ticker, session=None): if utils.is_isin(self.ticker): self.ticker = utils.get_ticker_by_isin(self.ticker, None, session) - self._data = TickerData(self.ticker, session=session) - - def stats(self, proxy=None): + self._data: TickerData = TickerData(self.ticker, session=session) - if self._fundamentals: - return + self._analysis = Analysis(self._data) + self._holders = Holders(self._data) + self._quote = Quote(self._data) + self._fundamentals = Fundamentals(self._data) + def stats(self, proxy=None): ticker_url = "{}/{}".format(self._scrape_url, self.ticker) # get info and sustainability - data = self._data.get_json_data_stores(ticker_url, proxy)["QuoteSummaryStore"] + data = self._data.get_json_data_stores(proxy=proxy)["QuoteSummaryStore"] return data def history(self, period="1mo", interval="1d", start=None, end=None, prepost=False, actions=True, auto_adjust=True, back_adjust=False, repair=False, keepna=False, proxy=None, rounding=False, timeout=10, - debug=True, raise_errors=False): + debug=True, raise_errors=False) -> pd.DataFrame: """ :Parameters: period : str @@ -164,7 +155,8 @@ def history(self, period="1mo", interval="1d", if interval == "1m": start = end - 604800 # Subtract 7 days else: - start = -631159200 + _UNIX_TIMESTAMP_1900 = -2208994789 + start = _UNIX_TIMESTAMP_1900 else: start = utils._parse_user_dt(start, tz) params = {"period1": start, "period2": end} @@ -174,7 +166,6 @@ def history(self, period="1mo", interval="1d", params["interval"] = interval.lower() params["includePrePost"] = prepost - params["events"] = "div,splits" # 1) fix weired bug with Yahoo! 
- returning 60m for 30m bars if params["interval"] == "30m": @@ -186,13 +177,24 @@ proxy = proxy["https"] proxy = {"https": proxy} + # If the ticker is a MUTUALFUND or ETF, also request capitalGains events + params["events"] = "div,splits,capitalGains" + # Getting data from json url = "{}/v8/finance/chart/{}".format(self._base_url, self.ticker) data = None try: - data = self._data.get( + get_fn = self._data.get + if end is not None: + end_dt = _pd.Timestamp(end, unit='s').tz_localize("UTC") + dt_now = end_dt.tzinfo.localize(_datetime.datetime.utcnow()) + data_delay = _datetime.timedelta(minutes=30) + if end_dt+data_delay <= dt_now: + # Date range in past so safe to fetch through cache: + get_fn = self._data.cache_get + data = get_fn( url=url, params=params, timeout=timeout @@ -206,6 +208,12 @@ except Exception: pass + # Store the meta data that gets retrieved simultaneously + try: + self._history_metadata = data["chart"]["result"][0]["meta"] + except KeyError: + self._history_metadata = {} + err_msg = "No data found for this date range, symbol may be delisted" fail = False if data is None or not type(data) is dict: @@ -219,9 +227,9 @@ elif "chart" not in data or data["chart"]["result"] is None or not data["chart"]["result"]: fail = True elif period is not None and "timestamp" not in data["chart"]["result"][0] and period not in \ - data["chart"]["result"][0]["meta"]["validRanges"]: + self._history_metadata["validRanges"]: # User provided a bad period. The minimum should be '1d', but sometimes Yahoo accepts '1h'. - err_msg = "Period '{}' is invalid, must be one of {}".format(period, data["chart"]["result"][0]["meta"][ + err_msg = "Period '{}' is invalid, must be one of {}".format(period, self._history_metadata[ "validRanges"]) fail = True if fail: @@ -233,7 +241,7 @@ else: print('%s: %s' % (self.ticker, err_msg)) return utils.empty_df() - + # parse quotes try: quotes = utils.parse_quotes(data["chart"]["result"][0]) @@ -272,70 +280,56 @@ except Exception: pass - tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"] + # Select useful info from metadata + quote_type = self._history_metadata["instrumentType"] + expect_capital_gains = quote_type in ('MUTUALFUND', 'ETF') + tz_exchange = self._history_metadata["exchangeTimezoneName"]
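
The fetch logic above only routes a request through `cache_get` when the requested range ended comfortably in the past, since such data can no longer change. A standalone sketch of that decision (the helper name is illustrative; the real code inlines this in `history()`):

```python
import datetime as _datetime
import pandas as _pd

def _range_safe_to_cache(end, data_delay=_datetime.timedelta(minutes=30)):
    # Mirrors the check above: if the requested range ended more than
    # `data_delay` ago, a cached response can never become stale.
    end_dt = _pd.Timestamp(end, unit='s').tz_localize("UTC")
    dt_now = end_dt.tzinfo.localize(_datetime.datetime.utcnow())
    return end_dt + data_delay <= dt_now
```
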
# Note: ordering is important. If you change order, run the tests! quotes = utils.set_df_tz(quotes, params["interval"], tz_exchange) quotes = utils.fix_Yahoo_dst_issue(quotes, params["interval"]) quotes = utils.fix_Yahoo_returning_live_separate(quotes, params["interval"], tz_exchange) - if repair: - # Do this before auto/back adjust - quotes = self._fix_zero_prices(quotes, interval, tz_exchange) - quotes = self._fix_unit_mixups(quotes, interval, tz_exchange) - - # Auto/back adjust - try: - if auto_adjust: - quotes = utils.auto_adjust(quotes) - elif back_adjust: - quotes = utils.back_adjust(quotes) - except Exception as e: - if auto_adjust: - err_msg = "auto_adjust failed with %s" % e - else: - err_msg = "back_adjust failed with %s" % e - shared._DFS[self.ticker] = utils.empty_df() - shared._ERRORS[self.ticker] = err_msg - if debug: - if raise_errors: - raise Exception('%s: %s' % (self.ticker, err_msg)) - else: - print('%s: %s' % (self.ticker, err_msg)) - - if rounding: - quotes = _np.round(quotes, data[ - "chart"]["result"][0]["meta"]["priceHint"]) - quotes['Volume'] = quotes['Volume'].fillna(0).astype(_np.int64) # actions - dividends, splits = utils.parse_actions(data["chart"]["result"][0]) + dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0]) + if not expect_capital_gains: + capital_gains = None + if start is not None: - startDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(start)) + # Note: use pandas Timestamp as datetime.utcfromtimestamp has bugs on windows + # https://github.com/python/cpython/issues/81708 + startDt = _pd.Timestamp(start, unit='s') if dividends is not None: - dividends = dividends[dividends.index >= startDt] + dividends = dividends[dividends.index>=startDt] + if capital_gains is not None: + capital_gains = capital_gains[capital_gains.index>=startDt] if splits is not None: splits = splits[splits.index >= startDt] if end is not None: - endDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(end)) + endDt = _pd.Timestamp(end, unit='s') if dividends is not None: - dividends = dividends[dividends.index < endDt] + dividends = dividends[dividends.index<endDt] + if capital_gains is not None: + capital_gains = capital_gains[capital_gains.index<endDt] if splits is not None: splits = splits[splits.index < endDt] if dividends.shape[0] > 0: - dividends.index = _pd.to_datetime(dividends.index.date).tz_localize(tz_exchange, ambiguous=True) + dividends.index = _pd.to_datetime(dividends.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward') if splits.shape[0] > 0: - splits.index = _pd.to_datetime(splits.index.date).tz_localize(tz_exchange, ambiguous=True) + splits.index = _pd.to_datetime(splits.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward') # Combine df = quotes.sort_index() @@ -351,6 +345,42 @@ df.loc[df["Stock Splits"].isna(), "Stock Splits"] = 0 else: df["Stock Splits"] = 0.0 + if expect_capital_gains: + if capital_gains.shape[0] > 0: + df = utils.safe_merge_dfs(df, capital_gains, interval) + if "Capital Gains" in df.columns: + df.loc[df["Capital Gains"].isna(),"Capital Gains"] = 0 + else: + df["Capital Gains"] = 0.0 + + if repair: + # Do this before auto/back adjust + df = self._fix_zeroes(df, interval, tz_exchange) + df = self._fix_unit_mixups(df, interval, tz_exchange) + + # Auto/back adjust + try: + if auto_adjust: + df = utils.auto_adjust(df) + elif back_adjust: + df = utils.back_adjust(df) + except Exception as e: + if auto_adjust: + err_msg = "auto_adjust failed with %s" % e + else: + err_msg = "back_adjust failed with %s" % e + shared._DFS[self.ticker] = utils.empty_df() + shared._ERRORS[self.ticker] = err_msg + if debug: + if raise_errors: + raise Exception('%s: %s' % 
(self.ticker, err_msg)) + else: + print('%s: %s' % (self.ticker, err_msg)) + + if rounding: + df = _np.round(df, data[ + "chart"]["result"][0]["meta"]["priceHint"]) + df['Volume'] = df['Volume'].fillna(0).astype(_np.int64) if intraday: df.index.name = "Datetime" @@ -361,7 +391,7 @@ def history(self, period="1mo", interval="1d", df = df[~df.index.duplicated(keep='first')] self._history = df.copy() if not actions: - df = df.drop(columns=["Dividends", "Stock Splits"]) + df = df.drop(columns=["Dividends", "Stock Splits", "Capital Gains"], errors='ignore') if not keepna: mask_nan_or_zero = (df.isna() | (df == 0)).all(axis=1) df = df.drop(mask_nan_or_zero.index[mask_nan_or_zero]) @@ -370,15 +400,16 @@ def history(self, period="1mo", interval="1d", # ------------------------ - def _reconstruct_interval(self, df_row, interval, bad_fields): - if isinstance(df_row, _pd.DataFrame) or not isinstance(df_row, _pd.Series): - raise Exception("'df_row' must be a Pandas Series not", type(df_row)) - if not isinstance(bad_fields, (list,set,_np.ndarray)): - raise Exception("'bad_fields' must be a list/set not", type(bad_fields)) + def _reconstruct_intervals_batch(self, df, interval, tag=-1): + if not isinstance(df, _pd.DataFrame): + raise Exception("'df' must be a Pandas DataFrame not", type(df)) - data_cols = [c for c in ["Open","High","Low","Close","Adj Close"] if c in df_row.index] + # Reconstruct values in df using finer-grained price data. Delimiter marks what to reconstruct - # If interval is weekly then can construct with daily. But if smaller intervals then + price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df] + data_cols = price_cols + ["Volume"] + + # If interval is weekly then can construct with daily. But if smaller intervals then # restricted to recent times: # - daily = hourly restricted to last 730 days sub_interval = None @@ -391,83 +422,205 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): # Correct by fetching day of hourly data sub_interval = "1h" td_range = _datetime.timedelta(days=1) + elif interval == "1h": + sub_interval = "30m" + td_range = _datetime.timedelta(hours=1) else: print("WARNING: Have not implemented repair for '{}' interval. 
Contact developers".format(interval)) - return df_row + raise Exception("why here") + return df - idx = df_row.name - start = idx.date() - if sub_interval=="1h" and (_datetime.date.today()-start) > _datetime.timedelta(days=729): - # Don't bother requesting more price data, Yahoo will reject - return None + df = df.sort_index() + + f_repair = df[data_cols].to_numpy()==tag + f_repair_rows = f_repair.any(axis=1) + + # Ignore old intervals for which Yahoo won't return finer data: + if sub_interval == "1h": + f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=730) + f_repair_rows = f_repair_rows & f_recent + elif sub_interval in ["30m", "15m"]: + f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=60) + f_repair_rows = f_repair_rows & f_recent + if not f_repair_rows.any(): + print("data too old to fix") + return df + + dts_to_repair = df.index[f_repair_rows] + indices_to_repair = _np.where(f_repair_rows)[0] + + if len(dts_to_repair) == 0: + return df + + df_v2 = df.copy() + df_noNa = df[~df[price_cols].isna().any(axis=1)] + + # Group nearby NaN-intervals together to reduce number of Yahoo fetches + dts_groups = [[dts_to_repair[0]]] + last_dt = dts_to_repair[0] + last_ind = indices_to_repair[0] + td = utils._interval_to_timedelta(interval) + if interval == "1mo": + grp_td_threshold = _datetime.timedelta(days=28) + elif interval == "1wk": + grp_td_threshold = _datetime.timedelta(days=28) + elif interval == "1d": + grp_td_threshold = _datetime.timedelta(days=14) + elif interval == "1h": + grp_td_threshold = _datetime.timedelta(days=7) else: - new_vals = {} + grp_td_threshold = _datetime.timedelta(days=2) + # grp_td_threshold = _datetime.timedelta(days=7) + for i in range(1, len(dts_to_repair)): + ind = indices_to_repair[i] + dt = dts_to_repair[i] + if (dt-dts_groups[-1][-1]) < grp_td_threshold: + dts_groups[-1].append(dt) + elif ind - last_ind <= 3: + dts_groups[-1].append(dt) + else: + dts_groups.append([dt]) + last_dt = dt + last_ind = ind + + # Add some good data to each group, so can calibrate later: + for i in range(len(dts_groups)): + g = dts_groups[i] + g0 = g[0] + i0 = df_noNa.index.get_loc(g0) + if i0 > 0: + dts_groups[i].insert(0, df_noNa.index[i0-1]) + gl = g[-1] + il = df_noNa.index.get_loc(gl) + if il < len(df_noNa)-1: + dts_groups[i].append(df_noNa.index[il+1]) - if sub_interval=="1h": - df_fine = self.history(start=start, end=start+td_range, interval=sub_interval, auto_adjust=False) + n_fixed = 0 + for g in dts_groups: + df_block = df[df.index.isin(g)] + + start_dt = g[0] + start_d = start_dt.date() + if sub_interval == "1h" and (_datetime.date.today() - start_d) > _datetime.timedelta(days=729): + # Don't bother requesting more price data, Yahoo will reject + continue + elif sub_interval in ["30m", "15m"] and (_datetime.date.today() - start_d) > _datetime.timedelta(days=59): + # Don't bother requesting more price data, Yahoo will reject + continue + + td_1d = _datetime.timedelta(days=1) + if interval in "1wk": + fetch_start = start_d - td_range # need previous week too + fetch_end = g[-1].date() + td_range + elif interval == "1d": + fetch_start = start_d + fetch_end = g[-1].date() + td_range + else: + fetch_start = g[0] + fetch_end = g[-1] + td_range + + prepost = interval == "1d" + df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True) + if df_fine is None or df_fine.empty: + print("YF: WARNING: Cannot reconstruct because Yahoo not returning 
data in interval") + continue + + df_fine["ctr"] = 0 + if interval == "1wk": + # df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-SUN").start_time + weekdays = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + week_end_day = weekdays[(df_block.index[0].weekday()+7-1)%7] + df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-"+week_end_day).start_time + grp_col = "Week Start" + elif interval == "1d": + df_fine["Day Start"] = pd.to_datetime(df_fine.index.date) + grp_col = "Day Start" else: - df_fine = self.history(start=start-td_range, end=start+td_range, interval=sub_interval, auto_adjust=False) - - # First, check whether df_fine has different split-adjustment than df_row. - # If it is different, then adjust df_fine to match df_row - good_fields = list(set(data_cols)-set(bad_fields)-set("Adj Close")) - if len(good_fields)==0: - raise Exception("No good fields, so cannot determine whether different split-adjustment. Contact developers") - # median = df_row.loc[good_fields].median() - # median_fine = _np.median(df_fine[good_fields].values) - # ratio = median/median_fine - # Better method to calculate split-adjustment: - df_fine_from_idx = df_fine[df_fine.index>=idx] - ratios = [] - for f in good_fields: - if f=="Low": - ratios.append(df_row[f] / df_fine_from_idx[f].min()) - elif f=="High": - ratios.append(df_row[f] / df_fine_from_idx[f].max()) - elif f=="Open": - ratios.append(df_row[f] / df_fine_from_idx[f].iloc[0]) - elif f=="Close": - ratios.append(df_row[f] / df_fine_from_idx[f].iloc[-1]) + df_fine.loc[df_fine.index.isin(df_block.index), "ctr"] = 1 + df_fine["intervalID"] = df_fine["ctr"].cumsum() + df_fine = df_fine.drop("ctr", axis=1) + grp_col = "intervalID" + df_fine = df_fine[~df_fine[price_cols].isna().all(axis=1)] + + df_new = df_fine.groupby(grp_col).agg( + Open=("Open", "first"), + Close=("Close", "last"), + AdjClose=("Adj Close", "last"), + Low=("Low", "min"), + High=("High", "max"), + Volume=("Volume", "sum")).rename(columns={"AdjClose":"Adj Close"}) + if grp_col in ["Week Start", "Day Start"]: + df_new.index = df_new.index.tz_localize(df_fine.index.tz) + else: + df_fine["diff"] = df_fine["intervalID"].diff() + new_index = _np.append([df_fine.index[0]], df_fine.index[df_fine["intervalID"].diff()>0]) + df_new.index = new_index + + # Calibrate! Check whether 'df_fine' has different split-adjustment. + # If different, then adjust to match 'df' + df_block_calib = df_block[price_cols] + calib_filter = df_block_calib.to_numpy() != tag + if not calib_filter.any(): + # Can't calibrate so don't attempt repair + continue + df_new_calib = df_new[df_new.index.isin(df_block_calib.index)][price_cols] + ratios = (df_block_calib[price_cols].to_numpy() / df_new_calib[price_cols].to_numpy())[calib_filter] ratio = _np.mean(ratios) # - ratio_rcp = round(1.0/ratio, 1) ; ratio = round(ratio, 1) - if ratio==1 and ratio_rcp==1: + ratio_rcp = round(1.0 / ratio, 1) + ratio = round(ratio, 1) + if ratio == 1 and ratio_rcp == 1: # Good! 
pass else: - if ratio>1: + if ratio > 1: # data has different split-adjustment than fine-grained data # Adjust fine-grained to match - df_fine[data_cols] *= ratio - elif ratio_rcp>1: + df_new[price_cols] *= ratio + df_new["Volume"] /= ratio + elif ratio_rcp > 1: # data has different split-adjustment than fine-grained data # Adjust fine-grained to match - df_fine[data_cols] *= 1.0/ratio_rcp - - if sub_interval != "1h": - df_last_week = df_fine[df_fine.index<idx] - df_fine = df_fine[df_fine.index>=idx] - - if "High" in bad_fields: - new_vals["High"] = df_fine["High"].max() - if "Low" in bad_fields: - new_vals["Low"] = df_fine["Low"].min() - if "Open" in bad_fields: - if sub_interval != "1h" and idx != df_fine.index[0]: - # Exchange closed Monday. In this case, Yahoo sets Open to last week close - new_vals["Open"] = df_last_week["Close"][-1] - if "Low" in new_vals: - new_vals["Low"] = min(new_vals["Open"], new_vals["Low"]) - elif new_vals["Open"] < df_row["Low"]: - new_vals["Low"] = new_vals["Open"] - else: - new_vals["Open"] = df_fine["Open"].iloc[0] - if "Close" in bad_fields: - new_vals["Close"] = df_fine["Close"].iloc[-1] - # Assume 'Adj Close' also corrupted, easier than detecting whether true - new_vals["Adj Close"] = df_fine["Adj Close"].iloc[-1] + df_new[price_cols] *= 1.0 / ratio_rcp + df_new["Volume"] *= ratio_rcp + + # Repair! + bad_dts = df_block.index[(df_block[price_cols]==tag).any(axis=1)] + + for idx in bad_dts: + if not idx in df_new.index: + # Yahoo didn't return finer-grain data for this interval, + # so probably no trading happened. + print("no fine data") + continue + df_new_row = df_new.loc[idx] + + if interval == "1wk": + df_last_week = df_new.iloc[df_new.index.get_loc(idx)-1] + df_fine = df_fine.loc[idx:] + + df_bad_row = df.loc[idx] + bad_fields = df_bad_row.index[df_bad_row==tag].values + if "High" in bad_fields: + df_v2.loc[idx, "High"] = df_new_row["High"] + if "Low" in bad_fields: + df_v2.loc[idx, "Low"] = df_new_row["Low"] + if "Open" in bad_fields: + if interval == "1wk" and idx != df_fine.index[0]: + # Exchange closed Monday. In this case, Yahoo sets Open to last week close + df_v2.loc[idx, "Open"] = df_last_week["Close"] + df_v2.loc[idx, "Low"] = min(df_v2.loc[idx, "Open"], df_v2.loc[idx, "Low"]) + else: + df_v2.loc[idx, "Open"] = df_new_row["Open"] + if "Close" in bad_fields: + df_v2.loc[idx, "Close"] = df_new_row["Close"] + # Assume 'Adj Close' also corrupted, easier than detecting whether true + df_v2.loc[idx, "Adj Close"] = df_new_row["Adj Close"] + if "Volume" in bad_fields: + df_v2.loc[idx, "Volume"] = df_new_row["Volume"] + n_fixed += 1 - return new_vals + return df_v2 def _fix_unit_mixups(self, df, interval, tz_exchange): # Sometimes Yahoo returns few prices in cents/pence instead of $/£ @@ -498,66 +651,80 @@ if (median == 0).any(): raise Exception("median contains zeroes, why?") ratio = df2[data_cols].values / median - ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20 + ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20 f = ratio_rounded == 100 + if not f.any(): + return df - # Store each mixup: - mixups = {} - for j in range(len(data_cols)): - fj = f[:, j] - if fj.any(): - dc = data_cols[j] - for i in _np.where(fj)[0]: - idx = df2.index[i] - if idx not in mixups: - mixups[idx] = {"data": df2.loc[idx, data_cols], "fields":{dc}} - else: - mixups[idx]["fields"].add(dc) - n_mixups = len(mixups) - - if len(mixups) > 0: - # This first pass will correct all errors in Open/Close/AdjClose columns. - # It will also attempt to correct Low/High columns, but only if can get price data. - for idx in sorted(list(mixups.keys())): - m = mixups[idx] - new_values = self._reconstruct_interval(df2.loc[idx], interval, m["fields"]) - if not new_values is None: - for k in new_values: - df2.loc[idx, k] = new_values[k] - del mixups[idx] + # Mark values to send for repair + tag = -1.0 + for i in range(len(data_cols)): + fi = f[:,i] + c = data_cols[i] + df2.loc[fi, c] = tag + + n_before = (df2[data_cols].to_numpy()==tag).sum() + df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) + n_after = (df2[data_cols].to_numpy()==tag).sum() + if n_after > 0: # This second pass will *crudely* "fix" any remaining errors in High/Low - simply by ensuring they don't contradict e.g. Low = 100x High - if len(mixups) > 0: - for idx in sorted(list(mixups.keys())): - m = mixups[idx] - row = df2.loc[idx, ["Open", "Close"]] - if "High" in m["fields"]: - df2.loc[idx, "High"] = row.max() - m["fields"].remove("High") - if "Low" in m["fields"]: - df2.loc[idx, "Low"] = row.min() - m["fields"].remove("Low") - - if len(m["fields"]) == 0: - del mixups[idx] - - n_fixed = n_mixups - len(mixups) - print("{}: fixed {} currency unit mixups in {} price data".format(self.ticker, n_fixed, interval)) - if len(mixups) > 0: - print(" ... and failed to correct {}".format(len(mixups))) + simply by ensuring they don't contradict e.g. Low = 100x High. + f = df2[data_cols].to_numpy()==tag + for i in range(f.shape[0]): + fi = f[i,:] + if not fi.any(): + continue + idx = df2.index[i] + + c = "Open" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 0.01 + # + c = "Close" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 0.01 + # + c = "High" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max() + # + c = "Low" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min() + + n_after_crude = (df2[data_cols].to_numpy()==tag).sum() + + n_fixed = n_before - n_after_crude + n_fixed_crudely = n_after - n_after_crude + if n_fixed > 0: + report_msg = f"{self.ticker}: fixed {n_fixed}/{n_before} currency unit mixups " + if n_fixed_crudely > 0: + report_msg += f"({n_fixed_crudely} crudely) " + report_msg += f"in {interval} price data" + print(report_msg) + + # Restore original values where repair failed + f = df2[data_cols].values==tag + for j in range(len(data_cols)): + fj = f[:,j] + if fj.any(): + c = data_cols[j] + df2.loc[fj, c] = df.loc[fj, c] return df2
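
Both repair paths (`_fix_unit_mixups` above, `_fix_zeroes` below) now share one pattern: overwrite suspect cells with a sentinel, let `_reconstruct_intervals_batch` rebuild what it can from finer-grained data, then restore the original values wherever the sentinel survived. A condensed sketch of that flow (`reconstruct` stands in for the batch repairer; other names mirror the diff):

```python
import pandas as pd

def repair_with_tag(df, bad_mask, cols, reconstruct, tag=-1.0):
    df2 = df.copy()
    # Mark suspect cells with the sentinel value
    for j, c in enumerate(cols):
        df2.loc[bad_mask[:, j], c] = tag
    # Batch-reconstruct from finer-grained data where possible
    df2 = reconstruct(df2, tag=tag)
    # Restore originals wherever repair failed (sentinel survived)
    still_bad = df2[cols].values == tag
    for j, c in enumerate(cols):
        df2.loc[still_bad[:, j], c] = df.loc[still_bad[:, j], c]
    return df2
```
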
- def _fix_zero_prices(self, df, interval, tz_exchange): - # Sometimes Yahoo returns prices=0 when obviously wrong e.g. Volume>0 and Close>0. - # Easy to detect and fix + def _fix_zeroes(self, df, interval, tz_exchange): + # Sometimes Yahoo returns prices=0 or NaN when trades occurred. + # But most of the time, prices=0 or NaN are returned because no trades occurred. + # Impossible to distinguish, so only attempt repair when bad values are few or rare. 
if df.shape[0] == 0: return df - if df.shape[0] == 1: - # Need multiple rows to confidently identify outliers - return df df2 = df.copy() @@ -566,23 +733,45 @@ def _fix_zero_prices(self, df, interval, tz_exchange): else: df2.index = df2.index.tz_convert(tz_exchange) - data_cols = ["Open","High","Low","Close"] - data_cols = [c for c in data_cols if c in df2.columns] - f_zeroes = (df2[data_cols]==0.0).values.any(axis=1) - - n_fixed = 0 - for i in _np.where(f_zeroes)[0]: - idx = df2.index[i] - df_row = df2.loc[idx] - bad_fields = df2.columns[df_row.values==0.0].values - new_values = self._reconstruct_interval(df2.loc[idx], interval, bad_fields) - if not new_values is None: - for k in new_values: - df2.loc[idx, k] = new_values[k] - n_fixed += 1 + price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df2.columns] + f_zero_or_nan = (df2[price_cols] == 0.0).values | df2[price_cols].isna().values + # Check whether worth attempting repair + if f_zero_or_nan.any(axis=1).sum() == 0: + return df + if f_zero_or_nan.sum() == len(price_cols)*len(df2): + # Need some good data to calibrate + return df + # - avoid repair if many zeroes/NaNs + pct_zero_or_nan = f_zero_or_nan.sum() / (len(price_cols)*len(df2)) + if f_zero_or_nan.any(axis=1).sum()>2 and pct_zero_or_nan > 0.05: + return df - if n_fixed>0: + data_cols = price_cols + ["Volume"] + + # Mark values to send for repair + tag = -1.0 + for i in range(len(price_cols)): + c = price_cols[i] + df2.loc[f_zero_or_nan[:,i], c] = tag + # If volume=0 or NaN for bad prices, then tag volume for repair + df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"]==0), "Volume"] = tag + df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"].isna()), "Volume"] = tag + + n_before = (df2[data_cols].to_numpy()==tag).sum() + df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) + n_after = (df2[data_cols].to_numpy()==tag).sum() + n_fixed = n_before - n_after + if n_fixed > 0: print("{}: fixed {} price=0.0 errors in {} price data".format(self.ticker, n_fixed, interval)) + + # Restore original values where repair failed (i.e. 
remove tag values) + f = df2[data_cols].values==tag + for j in range(len(data_cols)): + fj = f[:,j] + if fj.any(): + c = data_cols[j] + df2.loc[fj, c] = df.loc[fj, c] + return df2 def _get_ticker_tz(self, debug_mode, proxy, timeout): @@ -617,7 +806,7 @@ def _fetch_ticker_tz(self, debug_mode, proxy, timeout): url = "{}/v8/finance/chart/{}".format(self._base_url, self.ticker) try: - data = self._data.get(url=url, params=params, proxy=proxy, timeout=timeout) + data = self._data.cache_get(url=url, params=params, proxy=proxy, timeout=timeout) data = data.json() except Exception as e: if debug_mode: @@ -641,477 +830,100 @@ def _fetch_ticker_tz(self, debug_mode, proxy, timeout): print("-------------") return None - def _get_info(self, proxy=None): - if (self._info is not None) or (self._sustainability is not None) or self._recommendations: - # No need to fetch - return - - ticker_url = "{}/{}".format(self._scrape_url, self.ticker) - - # get info and sustainability - json_data = self._data.get_json_data_stores(ticker_url, proxy) - if 'QuoteSummaryStore' not in json_data: - err_msg = "No summary info found, symbol may be delisted" - print('- %s: %s' % (self.ticker, err_msg)) - return None - data = json_data['QuoteSummaryStore'] - - # sustainability - d = {} - try: - if isinstance(data.get('esgScores'), dict): - for item in data['esgScores']: - if not isinstance(data['esgScores'][item], (dict, list)): - d[item] = data['esgScores'][item] - - s = _pd.DataFrame(index=[0], data=d)[-1:].T - s.columns = ['Value'] - s.index.name = '%.f-%.f' % ( - s[s.index == 'ratingYear']['Value'].values[0], - s[s.index == 'ratingMonth']['Value'].values[0]) - - self._sustainability = s[~s.index.isin( - ['maxAge', 'ratingYear', 'ratingMonth'])] - except Exception: - pass - - # info (be nice to python 2) - self._info = {} - try: - items = ['summaryProfile', 'financialData', 'quoteType', - 'defaultKeyStatistics', 'assetProfile', 'summaryDetail'] - for item in items: - if isinstance(data.get(item), dict): - self._info.update(data[item]) - except Exception: - pass - - # For ETFs, provide this valuable data: the top holdings of the ETF - try: - if 'topHoldings' in data: - self._info.update(data['topHoldings']) - except Exception: - pass - - try: - if not isinstance(data.get('summaryDetail'), dict): - # For some reason summaryDetail did not give any results. 
The price dict - # usually has most of the same info - self._info.update(data.get('price', {})) - except Exception: - pass - - try: - # self._info['regularMarketPrice'] = self._info['regularMarketOpen'] - self._info['regularMarketPrice'] = data.get('price', {}).get( - 'regularMarketPrice', self._info.get('regularMarketOpen', None)) - except Exception: - pass - - try: - self._info['preMarketPrice'] = data.get('price', {}).get( - 'preMarketPrice', self._info.get('preMarketPrice', None)) - except Exception: - pass - - self._info['logo_url'] = "" - try: - if not 'website' in self._info: - self._info['logo_url'] = 'https://logo.clearbit.com/%s.com' % \ - self._info['shortName'].split(' ')[0].split(',')[0] - else: - domain = self._info['website'].split( - '://')[1].split('/')[0].replace('www.', '') - self._info['logo_url'] = 'https://logo.clearbit.com/%s' % domain - except Exception: - pass - - # events - try: - cal = _pd.DataFrame( - data['calendarEvents']['earnings']) - cal['earningsDate'] = _pd.to_datetime( - cal['earningsDate'], unit='s') - self._calendar = cal.T - self._calendar.index = utils.camel2title(self._calendar.index) - self._calendar.columns = ['Value'] - except Exception: - pass - - # analyst recommendations - try: - rec = _pd.DataFrame( - data['upgradeDowngradeHistory']['history']) - rec['earningsDate'] = _pd.to_datetime( - rec['epochGradeDate'], unit='s') - rec.set_index('earningsDate', inplace=True) - rec.index.name = 'Date' - rec.columns = utils.camel2title(rec.columns) - self._recommendations = rec[[ - 'Firm', 'To Grade', 'From Grade', 'Action']].sort_index() - except Exception: - pass - - # Complementary key-statistics. For now just want 'trailing PEG ratio' - keys = {"trailingPegRatio"} - if len(keys) > 0: - # Simplified the original scrape code for key-statistics. 
Very expensive for fetching - # just one value, best if scraping most/all: - # - # p = _re.compile(r'root\.App\.main = (.*);') - # url = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'.format(self.ticker, self.ticker) - # try: - # r = session.get(url, headers=utils.user_agent_headers) - # data = _json.loads(p.findall(r.text)[0]) - # key_stats = data['context']['dispatcher']['stores']['QuoteTimeSeriesStore']["timeSeries"] - # for k in keys: - # if k not in key_stats or len(key_stats[k])==0: - # # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate - # v = None - # else: - # # Select most recent (last) raw value in list: - # v = key_stats[k][-1]["reportedValue"]["raw"] - # self._info[k] = v - # except Exception: - # raise - # pass - # - # For just one/few variable is faster to query directly: - url = "https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{}?symbol={}".format( - self.ticker, self.ticker) - for k in keys: - url += "&type=" + k - # Request 6 months of data - url += "&period1={}".format( - int((_datetime.datetime.now() - _datetime.timedelta(days=365 // 2)).timestamp())) - url += "&period2={}".format(int((_datetime.datetime.now() + _datetime.timedelta(days=1)).timestamp())) - - json_str = self._data.get(url=url, proxy=proxy).text - json_data = _json.loads(json_str) - key_stats = json_data["timeseries"]["result"][0] - if k not in key_stats: - # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate - v = None - else: - # Select most recent (last) raw value in list: - v = key_stats[k][-1]["reportedValue"]["raw"] - self._info[k] = v - - def _get_fundamentals(self, proxy=None): - def cleanup(data): - ''' - The cleanup function is used for parsing yahoo finance json financial statement data into a pandas dataframe format. 
- ''' - df = _pd.DataFrame(data).drop(columns=['maxAge']) - for col in df.columns: - df[col] = _np.where( - df[col].astype(str) == '-', _np.nan, df[col]) - - df.set_index('endDate', inplace=True) - try: - df.index = _pd.to_datetime(df.index, unit='s') - except ValueError: - df.index = _pd.to_datetime(df.index) - df = df.T - df.columns.name = '' - df.index.name = 'Breakdown' - - # rename incorrect yahoo key - df.rename(index={'treasuryStock': 'Gains Losses Not Affecting Retained Earnings'}, inplace=True) - - df.index = utils.camel2title(df.index) - return df - - if self._fundamentals: - return - - ticker_url = "{}/{}".format(self._scrape_url, self.ticker) - - # holders - try: - resp = self._data.get(ticker_url + '/holders', proxy) - holders = _pd.read_html(resp.text) - except Exception: - holders = [] - - if len(holders) >= 3: - self._major_holders = holders[0] - self._institutional_holders = holders[1] - self._mutualfund_holders = holders[2] - elif len(holders) >= 2: - self._major_holders = holders[0] - self._institutional_holders = holders[1] - elif len(holders) >= 1: - self._major_holders = holders[0] - - if self._institutional_holders is not None: - if 'Date Reported' in self._institutional_holders: - self._institutional_holders['Date Reported'] = _pd.to_datetime( - self._institutional_holders['Date Reported']) - if '% Out' in self._institutional_holders: - self._institutional_holders['% Out'] = self._institutional_holders[ - '% Out'].str.replace('%', '').astype(float) / 100 - - if self._mutualfund_holders is not None: - if 'Date Reported' in self._mutualfund_holders: - self._mutualfund_holders['Date Reported'] = _pd.to_datetime( - self._mutualfund_holders['Date Reported']) - if '% Out' in self._mutualfund_holders: - self._mutualfund_holders['% Out'] = self._mutualfund_holders[ - '% Out'].str.replace('%', '').astype(float) / 100 - - self._get_info(proxy) - - # get fundamentals - self._earnings = {"yearly": utils._pd.DataFrame(), "quarterly": utils._pd.DataFrame()} - self._financials = {} - for name in ["income", "balance-sheet", "cash-flow"]: - self._financials[name] = {"yearly": utils._pd.DataFrame(), "quarterly": utils._pd.DataFrame()} - - financials_data = self._data.get_json_data_stores(ticker_url + '/financials', proxy) - if not "QuoteSummaryStore" in financials_data: - err_msg = "No financials data found, symbol may be delisted" - print('- %s: %s' % (self.ticker, err_msg)) - return None - fin_data_quote = financials_data['QuoteSummaryStore'] - - # generic patterns - for name in ["income", "balance-sheet", "cash-flow"]: - annual, qtr = self._create_financials_table(name, proxy) - if annual is not None: - self._financials[name]["yearly"] = annual - if qtr is not None: - self._financials[name]["quarterly"] = qtr - - # earnings - if isinstance(fin_data_quote.get('earnings'), dict): - try: - earnings = fin_data_quote['earnings']['financialsChart'] - earnings['financialCurrency'] = fin_data_quote['earnings'].get('financialCurrency', 'USD') - self._earnings['financialCurrency'] = earnings['financialCurrency'] - df = _pd.DataFrame(earnings['yearly']).set_index('date') - df.columns = utils.camel2title(df.columns) - df.index.name = 'Year' - self._earnings['yearly'] = df - - df = _pd.DataFrame(earnings['quarterly']).set_index('date') - df.columns = utils.camel2title(df.columns) - df.index.name = 'Quarter' - self._earnings['quarterly'] = df - except Exception: - pass - - # shares outstanding - try: - # keep only years with non None data - available_shares = [shares_data for shares_data in - 
financials_data['QuoteTimeSeriesStore']['timeSeries']['annualBasicAverageShares'] if - shares_data] - shares = _pd.DataFrame(available_shares) - shares['Year'] = shares['asOfDate'].agg(lambda x: int(x[:4])) - shares.set_index('Year', inplace=True) - shares.drop(columns=['dataId', 'asOfDate', - 'periodType', 'currencyCode'], inplace=True) - shares.rename( - columns={'reportedValue': "BasicShares"}, inplace=True) - self._shares = shares - except Exception: - pass - - # Analysis - data = self._data.get_json_data_stores(ticker_url + '/analysis', proxy)["QuoteSummaryStore"] - - if isinstance(data.get('earningsTrend'), dict): - try: - analysis = _pd.DataFrame(data['earningsTrend']['trend']) - analysis['endDate'] = _pd.to_datetime(analysis['endDate']) - analysis.set_index('period', inplace=True) - analysis.index = analysis.index.str.upper() - analysis.index.name = 'Period' - analysis.columns = utils.camel2title(analysis.columns) - - dict_cols = [] - - for idx, row in analysis.iterrows(): - for colname, colval in row.items(): - if isinstance(colval, dict): - dict_cols.append(colname) - for k, v in colval.items(): - new_colname = colname + ' ' + \ - utils.camel2title([k])[0] - analysis.loc[idx, new_colname] = v - - self._earnings_trend = analysis[[ - c for c in analysis.columns if c not in dict_cols]] - except Exception: - pass - - # Analysis Data/Analyst Forecasts - try: - analysis_data = self._data.get_json_data_stores(ticker_url + '/analysis', proxy) - analysis_data = analysis_data['QuoteSummaryStore'] - except Exception as e: - analysis_data = {} - try: - self._analyst_trend_details = _pd.DataFrame(analysis_data['recommendationTrend']['trend']) - except Exception as e: - self._analyst_trend_details = None - try: - self._analyst_price_target = _pd.DataFrame(analysis_data['financialData'], index=[0])[ - ['targetLowPrice', 'currentPrice', 'targetMeanPrice', 'targetHighPrice', 'numberOfAnalystOpinions']].T - except Exception as e: - self._analyst_price_target = None - earnings_estimate = [] - revenue_estimate = [] - if len(self._analyst_trend_details) != 0: - for key in analysis_data['earningsTrend']['trend']: - try: - earnings_dict = key['earningsEstimate'] - earnings_dict['period'] = key['period'] - earnings_dict['endDate'] = key['endDate'] - earnings_estimate.append(earnings_dict) - - revenue_dict = key['revenueEstimate'] - revenue_dict['period'] = key['period'] - revenue_dict['endDate'] = key['endDate'] - revenue_estimate.append(revenue_dict) - except Exception as e: - pass - self._rev_est = _pd.DataFrame(revenue_estimate) - self._eps_est = _pd.DataFrame(earnings_estimate) - else: - self._rev_est = _pd.DataFrame() - self._eps_est = _pd.DataFrame() - - self._fundamentals = True - - def _create_financials_table(self, name, proxy): - acceptable_names = ["income", "balance-sheet", "cash-flow"] - if not name in acceptable_names: - raise Exception("name '{}' must be one of: {}".format(name, acceptable_names)) - - if name == "income": - # Yahoo stores the 'income' table internally under 'financials' key - name = "financials" - - ticker_url = "{}/{}".format(self._scrape_url, self.ticker) - data_stores = self._data.get_json_data_stores(ticker_url + '/' + name, proxy) - _stmt_annual = None - _stmt_qtr = None - try: - # Developers note: TTM and template stuff allows for reproducing the nested structure - # visible on Yahoo website. But more work needed to make it user-friendly! 
Ideally - # return a tree data structure instead of Pandas MultiIndex - # So until this is implemented, just return simple tables - _stmt_annual = self._data.get_financials_time_series("annual", data_stores, proxy) - _stmt_qtr = self._data.get_financials_time_series("quarterly", data_stores, proxy) - - # template_ttm_order, template_annual_order, template_order, level_detail = utils.build_template(data_store["FinancialTemplateStore"]) - # TTM_dicts, Annual_dicts = utils.retreive_financial_details(data_store['QuoteTimeSeriesStore']) - # if name == "balance-sheet": - # # Note: balance sheet is the only financial statement with no ttm detail - # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order) - # else: - # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order, TTM_dicts, template_ttm_order) - - # Data store doesn't contain quarterly data, so retrieve using different url: - # _qtr_data = utils.get_financials_time_series(self.ticker, name, "quarterly", ticker_url, proxy, self.session) - # _stmt_qtr = utils.format_quarterly_financial_statement(_qtr_data, level_detail, template_order) - - except Exception as e: - pass - - return _stmt_annual, _stmt_qtr - def get_recommendations(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._recommendations + self._quote.proxy = proxy + data = self._quote.recommendations if as_dict: return data.to_dict() return data def get_calendar(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._calendar + self._quote.proxy = proxy + data = self._quote.calendar if as_dict: return data.to_dict() return data def get_major_holders(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._major_holders + self._holders.proxy = proxy + data = self._holders.major if as_dict: return data.to_dict() return data def get_institutional_holders(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._institutional_holders + self._holders.proxy = proxy + data = self._holders.institutional if data is not None: if as_dict: return data.to_dict() return data def get_mutualfund_holders(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._mutualfund_holders + self._holders.proxy = proxy + data = self._holders.mutualfund if data is not None: if as_dict: return data.to_dict() return data - def get_info(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._info - if as_dict: - return data.to_dict() + def get_info(self, proxy=None) -> dict: + self._quote.proxy = proxy + data = self._quote.info return data def get_sustainability(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._sustainability + self._quote.proxy = proxy + data = self._quote.sustainability if as_dict: return data.to_dict() return data - def get_recommendations_summary(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._analyst_trend_details + def get_recommendations_summary(self, proxy=None, as_dict=False): + self._quote.proxy = proxy + data = self._quote.recommendations if as_dict: return data.to_dict() return data - def get_analyst_price_target(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._analyst_price_target + def get_analyst_price_target(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = 
self._analysis.analyst_price_target if as_dict: return data.to_dict() return data - def get_rev_forecast(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._rev_est + def get_rev_forecast(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.rev_est if as_dict: return data.to_dict() return data def get_earnings_forecast(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._eps_est + self._analysis.proxy = proxy + data = self._analysis.eps_est if as_dict: return data.to_dict() return data - def get_earnings_trend(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._earnings_trend + def get_trend_details(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.analyst_trend_details + if as_dict: + return data.to_dict() + return data + + def get_earnings_trend(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.earnings_trend if as_dict: return data.to_dict() return data def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._earnings[freq] + self._fundamentals.proxy = proxy + data = self._fundamentals.earnings[freq] if as_dict: dict_data = data.to_dict() dict_data['financialCurrency'] = 'USD' if 'financialCurrency' not in self._earnings else self._earnings[ @@ -1119,23 +931,47 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): return dict_data return data - def get_income_stmt(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._financials["income"][freq] + def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly", legacy=False): + self._fundamentals.proxy = proxy + + if legacy: + data = self._fundamentals.financials.get_income_scrape(freq=freq, proxy=proxy) + else: + data = self._fundamentals.financials.get_income_time_series(freq=freq, proxy=proxy) + + if pretty: + data = data.copy() + data.index = utils.camel2title(data.index, sep=' ', acronyms=["EBIT", "EBITDA", "EPS", "NI"]) if as_dict: return data.to_dict() return data - def get_balance_sheet(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._financials["balance-sheet"][freq] + def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearly", legacy=False): + self._fundamentals.proxy = proxy + + if legacy: + data = self._fundamentals.financials.get_balance_sheet_scrape(freq=freq, proxy=proxy) + else: + data = self._fundamentals.financials.get_balance_sheet_time_series(freq=freq, proxy=proxy) + + if pretty: + data = data.copy() + data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() return data - def get_cashflow(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._financials["cash-flow"][freq] + def get_cashflow(self, proxy=None, as_dict=False, pretty=False, freq="yearly", legacy=False): + self._fundamentals.proxy = proxy + + if legacy: + data = self._fundamentals.financials.get_cash_flow_scrape(freq=freq, proxy=proxy) + else: + data = self._fundamentals.financials.get_cash_flow_time_series(freq=freq, proxy=proxy) + + if pretty: + data = data.copy() + data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() return data @@ -1148,6 +984,14 
@@ def get_dividends(self, proxy=None):
             return dividends[dividends != 0]
         return []
 
+    def get_capital_gains(self, proxy=None):
+        if self._history is None:
+            self.history(period="max", proxy=proxy)
+        if self._history is not None and "Capital Gains" in self._history:
+            capital_gains = self._history["Capital Gains"]
+            return capital_gains[capital_gains != 0]
+        return []
+
     def get_splits(self, proxy=None):
         if self._history is None:
             self.history(period="max", proxy=proxy)
@@ -1160,18 +1004,21 @@ def get_actions(self, proxy=None):
         if self._history is None:
             self.history(period="max", proxy=proxy)
         if self._history is not None and "Dividends" in self._history and "Stock Splits" in self._history:
-            actions = self._history[["Dividends", "Stock Splits"]]
+            action_columns = ["Dividends", "Stock Splits"]
+            if "Capital Gains" in self._history:
+                action_columns.append("Capital Gains")
+            actions = self._history[action_columns]
             return actions[actions != 0].dropna(how='all').fillna(0)
         return []
 
     def get_shares(self, proxy=None, as_dict=False):
-        self._get_fundamentals(proxy=proxy)
-        data = self._shares
+        self._fundamentals.proxy = proxy
+        data = self._fundamentals.shares
         if as_dict:
             return data.to_dict()
         return data
 
-    def get_isin(self, proxy=None):
+    def get_isin(self, proxy=None) -> Optional[str]:
         # *** experimental ***
         if self._isin is not None:
             return self._isin
@@ -1183,17 +1030,18 @@ def get_isin(self, proxy=None):
             return self._isin
 
         q = ticker
-        self.get_info(proxy=proxy)
-        if self._info is None:
-            # Don't print error message cause _get_info() will print one
+
+        self._quote.proxy = proxy
+        if self._quote.info is None:
+            # Don't print an error message because self._quote.info will print one
+            return None
-        if "shortName" in self._info:
-            q = self._info['shortName']
+        if "shortName" in self._quote.info:
+            q = self._quote.info['shortName']
 
         url = 'https://markets.businessinsider.com/ajax/' \
               'SearchController_Suggest?max_results=25&query=%s' \
             % urlencode(q)
-        data = self._data.get(url=url, proxy=proxy).text
+        data = self._data.cache_get(url=url, proxy=proxy).text
 
         search_str = '"{}|'.format(ticker)
         if search_str not in data:
@@ -1215,7 +1063,7 @@ def get_news(self, proxy=None):
 
         # Getting data from json
         url = "{}/v1/finance/search?q={}".format(self._base_url, self.ticker)
-        data = self._data.get(url=url, proxy=proxy)
+        data = self._data.cache_get(url=url, proxy=proxy)
         if "Will be right back" in data.text:
             raise RuntimeError("*** YAHOO! FINANCE IS CURRENTLY DOWN! ***\n"
                                "Our engineers are working quickly to resolve "
@@ -1226,18 +1074,27 @@ def get_news(self, proxy=None):
         self._news = data.get("news", [])
         return self._news
 
-    def get_earnings_dates(self, proxy=None):
-        if self._earnings_dates is not None:
-            return self._earnings_dates
+    def get_earnings_dates(self, limit=12, proxy=None) -> Optional[_pd.DataFrame]:
+        """
+        Get earnings dates (future and historic)
+        :param limit: max amount of upcoming and recent earnings dates to return.
+                      Default value 12 should return next 4 quarters and last 8 quarters.
+                      Increase if more history is needed.
 
-        page_size = 100  # YF caps at 100, don't go higher
+        :param proxy: requests proxy to use.
+        :return: pandas DataFrame
+        """
+        if self._earnings_dates and limit in self._earnings_dates:
+            return self._earnings_dates[limit]
+
+        page_size = min(limit, 100)  # YF caps at 100, don't go higher
         page_offset = 0
         dates = None
         while True:
             url = "{}/calendar/earnings?symbol={}&offset={}&size={}".format(
                 _ROOT_URL_, self.ticker, page_offset, page_size)
-            data = self._data.get(url=url, proxy=proxy).text
+            data = self._data.cache_get(url=url, proxy=proxy).text
 
             if "Will be right back" in data:
                 raise RuntimeError("*** YAHOO! FINANCE IS CURRENTLY DOWN! ***\n"
@@ -1253,14 +1110,21 @@ def get_earnings_dates(self, proxy=None):
                 # Actually YF was successful, problem is company doesn't have earnings history
                 dates = utils.empty_earnings_dates_df()
                 break
-
             if dates is None:
                 dates = data
             else:
                 dates = _pd.concat([dates, data], axis=0)
+            page_offset += page_size
+            # got less data than we asked for, or already fetched all we requested; no need to fetch more pages
+            if len(data) < page_size or len(dates) >= limit:
+                dates = dates.iloc[:limit]
+                break
+            else:
+                # do not fetch more than needed next time
+                page_size = min(limit - len(dates), page_size)
 
-        if dates is None or dates.shape[0]==0:
+        if dates is None or dates.shape[0] == 0:
             err_msg = "No earnings dates found, symbol may be delisted"
             print('- %s: %s' % (self.ticker, err_msg))
             return None
@@ -1289,37 +1153,18 @@ def get_earnings_dates(self, proxy=None):
             dates[cn] = dates[cn] + ' ' + tzinfo["AM/PM"]
             dates[cn] = _pd.to_datetime(dates[cn], format="%b %d, %Y, %I %p")
             # - instead of attempting decoding of ambiguous timezone abbreviation, just use 'info':
+            self._quote.proxy = proxy
             dates[cn] = dates[cn].dt.tz_localize(
-                tz=self.get_info()["exchangeTimezoneName"])
+                tz=self._quote.info["exchangeTimezoneName"])
 
         dates = dates.set_index("Earnings Date")
 
-        self._earnings_dates = dates
+        self._earnings_dates[limit] = dates
 
         return dates
 
-    def get_earnings_history(self, proxy=None):
-        if self._earnings_history is not None:
-            return self._earnings_history
-
-        url = "{}/calendar/earnings?symbol={}".format(_ROOT_URL_, self.ticker)
-        data = self._data.get(url=url, proxy=proxy).text
-
-        if "Will be right back" in data:
-            raise RuntimeError("*** YAHOO! FINANCE IS CURRENTLY DOWN! ***\n"
-                               "Our engineers are working quickly to resolve "
-                               "the issue. Thank you for your patience.")
-
-        try:
-            # read_html returns a list of pandas Dataframes of all the tables in `data`
-            data = _pd.read_html(data)[0]
-            data.replace("-", _np.nan, inplace=True)
-
-            data['EPS Estimate'] = _pd.to_numeric(data['EPS Estimate'])
-            data['Reported EPS'] = _pd.to_numeric(data['Reported EPS'])
-            self._earnings_history = data
-        # if no tables are found a ValueError is thrown
-        except ValueError:
-            print("Could not find earnings history data for {}.".format(self.ticker))
-            return
-        return data
+    def get_history_metadata(self) -> dict:
+        if self._history_metadata is None:
+            raise RuntimeError("Metadata was never retrieved, "
+                               "call history() first to retrieve it")
+        return self._history_metadata
\ No newline at end of file
diff --git a/yfinance/data.py b/yfinance/data.py
index acbe02964..fb507d50b 100644
--- a/yfinance/data.py
+++ b/yfinance/data.py
@@ -1,8 +1,6 @@
-import datetime
 import functools
 from functools import lru_cache
 
-import pandas as pd
 import requests as requests
 import re
 
@@ -18,14 +16,16 @@
 
 def lru_cache_freezeargs(func):
     """
-    Decorator transforms mutable dictionary arguments into immutable
-    Needed so lru_cache can cache method calls what has dict arguments.
+    Decorator transforms mutable dictionary and list arguments into immutable types
+    Needed so lru_cache can cache method calls that have dict or list arguments.
     """
 
     @functools.wraps(func)
     def wrapped(*args, **kwargs):
         args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args])
         kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()}
+        args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args])
+        kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in kwargs.items()}
         return func(*args, **kwargs)
 
     # copy over the lru_cache extra methods to this wrapper to be able to access them
@@ -35,6 +35,9 @@ def wrapped(*args, **kwargs):
     return wrapped
 
 
+_SCRAPE_URL_ = 'https://finance.yahoo.com/quote'
+
+
 class TickerData:
     """
     Have one place to retrieve data from Yahoo API in order to ease caching and speed up operations
     """
     user_agent_headers = {
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
 
     def __init__(self, ticker: str, session=None):
-        self._ticker = ticker
+        self.ticker = ticker
         self._session = session or requests
 
-    @lru_cache_freezeargs
-    @lru_cache(maxsize=cache_maxsize)
     def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30):
         proxy = self._get_proxy(proxy)
         response = self._session.get(
@@ -58,6 +59,11 @@ def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30)
             headers=user_agent_headers or self.user_agent_headers)
         return response
 
+    @lru_cache_freezeargs
+    @lru_cache(maxsize=cache_maxsize)
+    def cache_get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30):
+        return self.get(url, user_agent_headers, params, proxy, timeout)
+
     def _get_proxy(self, proxy):
         # setup proxy in requests format
         if proxy is not None:
@@ -68,12 +74,18 @@ def _get_proxy(self, proxy):
 
     @lru_cache_freezeargs
     @lru_cache(maxsize=cache_maxsize)
-    def get_json_data_stores(self, url, proxy=None):
+    def get_json_data_stores(self, sub_page: str = None, proxy=None) -> dict:
         '''
         get_json_data_stores returns a python dictionary of the data stores in
         yahoo finance web page.
''' - html = self.get(url=url, proxy=proxy).text + if sub_page: + ticker_url = "{}/{}/{}".format(_SCRAPE_URL_, self.ticker, sub_page) + else: + ticker_url = "{}/{}".format(_SCRAPE_URL_, self.ticker) + html = self.get(url=ticker_url, proxy=proxy).text + + # The actual json-data for stores is in a javascript assignment in the webpage json_str = html.split('root.App.main =')[1].split( '(this)')[0].split(';\n}')[0].strip() data = json.loads(json_str)['context']['dispatcher']['stores'] @@ -84,70 +96,3 @@ def get_json_data_stores(self, url, proxy=None): r'{[\'|\"]raw[\'|\"]:(.*?),(.*?)}', r'\1', new_data) return json.loads(new_data) - - # Note cant use lru_cache as financials_data is a nested dict (freezeargs only handle flat dicts) - def get_financials_time_series(self, timescale, financials_data, proxy=None): - - acceptable_timestamps = ["annual", "quarterly"] - if timescale not in acceptable_timestamps: - raise Exception("timescale '{}' must be one of: {}".format(timescale, acceptable_timestamps)) - - # Step 1: get the keys: - def _finditem1(key, obj): - values = [] - if isinstance(obj, dict): - if key in obj.keys(): - values.append(obj[key]) - for k, v in obj.items(): - values += _finditem1(key, v) - elif isinstance(obj, list): - for v in obj: - values += _finditem1(key, v) - return values - - keys = _finditem1("key", financials_data['FinancialTemplateStore']) - - # Step 2: construct url: - ts_url_base = "https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{0}?symbol={0}".format( - self._ticker) - if len(keys) == 0: - raise Exception("Fetching keys failed") - url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys]) - # Yahoo returns maximum 4 years or 5 quarters, regardless of start_dt: - start_dt = datetime.datetime(2016, 12, 31) - end = (datetime.datetime.now() + datetime.timedelta(days=366)) - url += "&period1={}&period2={}".format(int(start_dt.timestamp()), int(end.timestamp())) - - # Step 3: fetch and reshape data - json_str = self.get(url=url, proxy=proxy).text - json_data = json.loads(json_str) - data_raw = json_data["timeseries"]["result"] - # data_raw = [v for v in data_raw if len(v) > 1] # Discard keys with no data - for d in data_raw: - del d["meta"] - - # Now reshape data into a table: - # Step 1: get columns and index: - timestamps = set() - data_unpacked = {} - for x in data_raw: - for k in x.keys(): - if k == "timestamp": - timestamps.update(x[k]) - else: - data_unpacked[k] = x[k] - timestamps = sorted(list(timestamps)) - dates = pd.to_datetime(timestamps, unit="s") - df = pd.DataFrame(columns=dates, index=list(data_unpacked.keys())) - for k, v in data_unpacked.items(): - if df is None: - df = pd.DataFrame(columns=dates, index=[k]) - df.loc[k] = {pd.Timestamp(x["asOfDate"]): x["reportedValue"]["raw"] for x in v} - - df.index = df.index.str.replace("^" + timescale, "", regex=True) - - # Reorder table to match order on Yahoo website - df = df.reindex([k for k in keys if k in df.index]) - df = df[sorted(df.columns, reverse=True)] - - return df diff --git a/yfinance/exceptions.py b/yfinance/exceptions.py new file mode 100644 index 000000000..866a3c620 --- /dev/null +++ b/yfinance/exceptions.py @@ -0,0 +1,6 @@ +class YFianceException(Exception): + pass + + +class YFianceDataException(YFianceException): + pass diff --git a/yfinance/multi.py b/yfinance/multi.py index 9a36df2df..7d2d9dbac 100644 --- a/yfinance/multi.py +++ b/yfinance/multi.py @@ -199,10 +199,16 @@ def _download_one_threaded(ticker, start=None, end=None, actions=False, 
progress=True, period="max",
                            interval="1d", prepost=False, proxy=None,
                            keepna=False, rounding=False, timeout=10):
-    data = _download_one(ticker, start, end, auto_adjust, back_adjust, repair,
-                         actions, period, interval, prepost, proxy, rounding,
-                         keepna, timeout)
-    shared._DFS[ticker.upper()] = data
+    try:
+        data = _download_one(ticker, start, end, auto_adjust, back_adjust, repair,
+                             actions, period, interval, prepost, proxy, rounding,
+                             keepna, timeout)
+    except Exception as e:
+        # global try/except needed as the current thread implementation breaks if an exception is raised.
+        shared._DFS[ticker.upper()] = utils.empty_df()
+        shared._ERRORS[ticker.upper()] = repr(e)
+    else:
+        shared._DFS[ticker.upper()] = data
     if progress:
         shared._PROGRESS_BAR.animate()
diff --git a/yfinance/scrapers/__init__.py b/yfinance/scrapers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/yfinance/scrapers/analysis.py b/yfinance/scrapers/analysis.py
new file mode 100644
index 000000000..e381b01f1
--- /dev/null
+++ b/yfinance/scrapers/analysis.py
@@ -0,0 +1,118 @@
+import pandas as pd
+
+from yfinance import utils
+from yfinance.data import TickerData
+
+
+class Analysis:
+
+    def __init__(self, data: TickerData, proxy=None):
+        self._data = data
+        self.proxy = proxy
+
+        self._earnings_trend = None
+        self._analyst_trend_details = None
+        self._analyst_price_target = None
+        self._rev_est = None
+        self._eps_est = None
+        self._already_scraped = False
+
+    @property
+    def earnings_trend(self) -> pd.DataFrame:
+        if self._earnings_trend is None:
+            self._scrape(self.proxy)
+        return self._earnings_trend
+
+    @property
+    def analyst_trend_details(self) -> pd.DataFrame:
+        if self._analyst_trend_details is None:
+            self._scrape(self.proxy)
+        return self._analyst_trend_details
+
+    @property
+    def analyst_price_target(self) -> pd.DataFrame:
+        if self._analyst_price_target is None:
+            self._scrape(self.proxy)
+        return self._analyst_price_target
+
+    @property
+    def rev_est(self) -> pd.DataFrame:
+        if self._rev_est is None:
+            self._scrape(self.proxy)
+        return self._rev_est
+
+    @property
+    def eps_est(self) -> pd.DataFrame:
+        if self._eps_est is None:
+            self._scrape(self.proxy)
+        return self._eps_est
+
+    def _scrape(self, proxy):
+        if self._already_scraped:
+            return
+        self._already_scraped = True
+
+        # Analysis Data/Analyst Forecasts
+        analysis_data = self._data.get_json_data_stores("analysis", proxy=proxy)
+        try:
+            analysis_data = analysis_data['QuoteSummaryStore']
+        except KeyError as e:
+            err_msg = "No analysis data found, symbol may be delisted"
+            print('- %s: %s' % (self._data.ticker, err_msg))
+            return
+
+        if isinstance(analysis_data.get('earningsTrend'), dict):
+            try:
+                analysis = pd.DataFrame(analysis_data['earningsTrend']['trend'])
+                analysis['endDate'] = pd.to_datetime(analysis['endDate'])
+                analysis.set_index('period', inplace=True)
+                analysis.index = analysis.index.str.upper()
+                analysis.index.name = 'Period'
+                analysis.columns = utils.camel2title(analysis.columns)
+
+                dict_cols = []
+
+                for idx, row in analysis.iterrows():
+                    for colname, colval in row.items():
+                        if isinstance(colval, dict):
+                            dict_cols.append(colname)
+                            for k, v in colval.items():
+                                new_colname = colname + ' ' + \
+                                              utils.camel2title([k])[0]
+                                analysis.loc[idx, new_colname] = v
+
+                self._earnings_trend = analysis[[
+                    c for c in analysis.columns if c not in dict_cols]]
+            except Exception:
+                pass
+
+        try:
+            self._analyst_trend_details = pd.DataFrame(analysis_data['recommendationTrend']['trend'])
+        except Exception as e:
+            self._analyst_trend_details = None
try: + self._analyst_price_target = pd.DataFrame(analysis_data['financialData'], index=[0])[ + ['targetLowPrice', 'currentPrice', 'targetMeanPrice', 'targetHighPrice', 'numberOfAnalystOpinions']].T + except Exception as e: + self._analyst_price_target = None + earnings_estimate = [] + revenue_estimate = [] + if self._analyst_trend_details is not None : + for key in analysis_data['earningsTrend']['trend']: + try: + earnings_dict = key['earningsEstimate'] + earnings_dict['period'] = key['period'] + earnings_dict['endDate'] = key['endDate'] + earnings_estimate.append(earnings_dict) + + revenue_dict = key['revenueEstimate'] + revenue_dict['period'] = key['period'] + revenue_dict['endDate'] = key['endDate'] + revenue_estimate.append(revenue_dict) + except Exception as e: + pass + self._rev_est = pd.DataFrame(revenue_estimate) + self._eps_est = pd.DataFrame(earnings_estimate) + else: + self._rev_est = pd.DataFrame() + self._eps_est = pd.DataFrame() diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py new file mode 100644 index 000000000..42f10194a --- /dev/null +++ b/yfinance/scrapers/fundamentals.py @@ -0,0 +1,323 @@ +import datetime +import json + +import pandas as pd +import numpy as np + +from yfinance import utils +from yfinance.data import TickerData +from yfinance.exceptions import YFianceDataException, YFianceException + + +class Fundamentals: + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self._earnings = None + self._financials = None + self._shares = None + + self._financials_data = None + self._fin_data_quote = None + self._basics_already_scraped = False + self._financials = Fiancials(data) + + @property + def financials(self) -> "Fiancials": + return self._financials + + @property + def earnings(self) -> dict: + if self._earnings is None: + self._scrape_earnings(self.proxy) + return self._earnings + + @property + def shares(self) -> pd.DataFrame: + if self._shares is None: + self._scrape_shares(self.proxy) + return self._shares + + def _scrape_basics(self, proxy): + if self._basics_already_scraped: + return + self._basics_already_scraped = True + + self._financials_data = self._data.get_json_data_stores('financials', proxy) + try: + self._fin_data_quote = self._financials_data['QuoteSummaryStore'] + except KeyError: + err_msg = "No financials data found, symbol may be delisted" + print('- %s: %s' % (self._data.ticker, err_msg)) + return None + + def _scrape_earnings(self, proxy): + self._scrape_basics(proxy) + # earnings + self._earnings = {"yearly": pd.DataFrame(), "quarterly": pd.DataFrame()} + if self._fin_data_quote is None: + return + if isinstance(self._fin_data_quote.get('earnings'), dict): + try: + earnings = self._fin_data_quote['earnings']['financialsChart'] + earnings['financialCurrency'] = self._fin_data_quote['earnings'].get('financialCurrency', 'USD') + self._earnings['financialCurrency'] = earnings['financialCurrency'] + df = pd.DataFrame(earnings['yearly']).set_index('date') + df.columns = utils.camel2title(df.columns) + df.index.name = 'Year' + self._earnings['yearly'] = df + + df = pd.DataFrame(earnings['quarterly']).set_index('date') + df.columns = utils.camel2title(df.columns) + df.index.name = 'Quarter' + self._earnings['quarterly'] = df + except Exception: + pass + + def _scrape_shares(self, proxy): + self._scrape_basics(proxy) + # shares outstanding + try: + # keep only years with non None data + available_shares = [shares_data for shares_data in + 
+                                self._financials_data['QuoteTimeSeriesStore']['timeSeries']['annualBasicAverageShares']
+                                if shares_data]
+            shares = pd.DataFrame(available_shares)
+            shares['Year'] = shares['asOfDate'].agg(lambda x: int(x[:4]))
+            shares.set_index('Year', inplace=True)
+            shares.drop(columns=['dataId', 'asOfDate',
+                                 'periodType', 'currencyCode'], inplace=True)
+            shares.rename(
+                columns={'reportedValue': "BasicShares"}, inplace=True)
+            self._shares = shares
+        except Exception:
+            pass
+
+
+class Fiancials:
+    def __init__(self, data: TickerData):
+        self._data = data
+        self._income_time_series = {}
+        self._balance_sheet_time_series = {}
+        self._cash_flow_time_series = {}
+        self._income_scraped = {}
+        self._balance_sheet_scraped = {}
+        self._cash_flow_scraped = {}
+
+    def get_income_time_series(self, freq="yearly", proxy=None) -> pd.DataFrame:
+        res = self._income_time_series
+        if freq not in res:
+            res[freq] = self._fetch_time_series("income", freq, proxy=proxy)
+        return res[freq]
+
+    def get_balance_sheet_time_series(self, freq="yearly", proxy=None) -> pd.DataFrame:
+        res = self._balance_sheet_time_series
+        if freq not in res:
+            res[freq] = self._fetch_time_series("balance-sheet", freq, proxy=proxy)
+        return res[freq]
+
+    def get_cash_flow_time_series(self, freq="yearly", proxy=None) -> pd.DataFrame:
+        res = self._cash_flow_time_series
+        if freq not in res:
+            res[freq] = self._fetch_time_series("cash-flow", freq, proxy=proxy)
+        return res[freq]
+
+    def _fetch_time_series(self, name, timescale, proxy=None):
+        # Fetching the time series is preferred over scraping 'QuoteSummaryStore',
+        # because it matches what Yahoo shows. But for some tickers it returns nothing,
+        # despite 'QuoteSummaryStore' containing valid data.
+
+        allowed_names = ["income", "balance-sheet", "cash-flow"]
+        allowed_timescales = ["yearly", "quarterly"]
+
+        if name not in allowed_names:
+            raise ValueError("Illegal argument: name must be one of: {}".format(allowed_names))
+        if timescale not in allowed_timescales:
+            raise ValueError("Illegal argument: timescale must be one of: {}".format(allowed_timescales))
+
+        try:
+            statement = self._create_financials_table(name, timescale, proxy)
+
+            if statement is not None:
+                return statement
+        except YFianceException as e:
+            print("Failed to create financials table for {} reason: {}".format(name, repr(e)))
+        return pd.DataFrame()
+
+    def _create_financials_table(self, name, timescale, proxy):
+        if name == "income":
+            # Yahoo stores the 'income' table internally under 'financials' key
+            name = "financials"
+
+        keys = self._get_datastore_keys(name, proxy)
+
+        try:
+            # Developers note: TTM and template stuff allows for reproducing the nested structure
+            # visible on Yahoo website. But more work needed to make it user-friendly! Ideally
+            # return a tree data structure instead of Pandas MultiIndex
+            # So until this is implemented, just return simple tables
+            return self.get_financials_time_series(timescale, keys, proxy)
+
+        except Exception as e:
+            pass
+
+    def _get_datastore_keys(self, sub_page, proxy) -> list:
+        data_stores = self._data.get_json_data_stores(sub_page, proxy)
+
+        # Step 1: get the keys:
+        def _finditem1(key, obj):
+            values = []
+            if isinstance(obj, dict):
+                if key in obj.keys():
+                    values.append(obj[key])
+                for k, v in obj.items():
+                    values += _finditem1(key, v)
+            elif isinstance(obj, list):
+                for v in obj:
+                    values += _finditem1(key, v)
+            return values
+
+        try:
+            keys = _finditem1("key", data_stores['FinancialTemplateStore'])
+        except KeyError as e:
+            raise YFianceDataException("Parsing FinancialTemplateStore failed, reason: {}".format(repr(e)))
+
+        if not keys:
+            raise YFianceDataException("No keys in FinancialTemplateStore")
+        return keys
+
+    def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.DataFrame:
+        timescale_translation = {"yearly": "annual", "quarterly": "quarterly"}
+        timescale = timescale_translation[timescale]
+
+        # Step 2: construct url:
+        ts_url_base = \
+            "https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{0}?symbol={0}" \
+            .format(self._data.ticker)
+
+        url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys])
+        # Yahoo returns maximum 4 years or 5 quarters, regardless of start_dt:
+        start_dt = datetime.datetime(2016, 12, 31)
+        end = (datetime.datetime.now() + datetime.timedelta(days=366))
+        url += "&period1={}&period2={}".format(int(start_dt.timestamp()), int(end.timestamp()))
+
+        # Step 3: fetch and reshape data
+        json_str = self._data.cache_get(url=url, proxy=proxy).text
+        json_data = json.loads(json_str)
+        data_raw = json_data["timeseries"]["result"]
+        # data_raw = [v for v in data_raw if len(v) > 1]  # Discard keys with no data
+        for d in data_raw:
+            del d["meta"]
+
+        # Now reshape data into a table:
+        # Step 1: get columns and index:
+        timestamps = set()
+        data_unpacked = {}
+        for x in data_raw:
+            for k in x.keys():
+                if k == "timestamp":
+                    timestamps.update(x[k])
+                else:
+                    data_unpacked[k] = x[k]
+        timestamps = sorted(list(timestamps))
+        dates = pd.to_datetime(timestamps, unit="s")
+        df = pd.DataFrame(columns=dates, index=list(data_unpacked.keys()))
+        for k, v in data_unpacked.items():
+            if df is None:
+                df = pd.DataFrame(columns=dates, index=[k])
+            df.loc[k] = {pd.Timestamp(x["asOfDate"]): x["reportedValue"]["raw"] for x in v}
+
+        df.index = df.index.str.replace("^" + timescale, "", regex=True)
+
+        # Reorder table to match order on Yahoo website
+        df = df.reindex([k for k in keys if k in df.index])
+        df = df[sorted(df.columns, reverse=True)]
+
+        return df
+
+    def get_income_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame:
+        res = self._income_scraped
+        if freq not in res:
+            res[freq] = self._scrape("income", freq, proxy=proxy)
+        return res[freq]
+
+    def get_balance_sheet_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame:
+        res = self._balance_sheet_scraped
+        if freq not in res:
+            res[freq] = self._scrape("balance-sheet", freq, proxy=proxy)
+        return res[freq]
+
+    def get_cash_flow_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame:
+        res = self._cash_flow_scraped
+        if freq not in res:
+            res[freq] = self._scrape("cash-flow", freq, proxy=proxy)
+        return res[freq]
+
+    def _scrape(self, name, timescale, proxy=None):
+        # Backup in case _fetch_time_series() fails to return data
+
allowed_names = ["income", "balance-sheet", "cash-flow"] + allowed_timescales = ["yearly", "quarterly"] + + if name not in allowed_names: + raise ValueError("Illegal argument: name must be one of: {}".format(allowed_names)) + if timescale not in allowed_timescales: + raise ValueError("Illegal argument: timescale must be one of: {}".format(allowed_names)) + + try: + statement = self._create_financials_table_old(name, timescale, proxy) + + if statement is not None: + return statement + except YFianceException as e: + print("Failed to create financials table for {} reason: {}".format(name, repr(e))) + return pd.DataFrame() + + def _create_financials_table_old(self, name, timescale, proxy): + data_stores = self._data.get_json_data_stores("financials", proxy) + + # Fetch raw data + if not "QuoteSummaryStore" in data_stores: + return pd.DataFrame() + data = data_stores["QuoteSummaryStore"] + + if name == "cash-flow": + key1 = "cashflowStatement" + key2 = "cashflowStatements" + elif name == "balance-sheet": + key1 = "balanceSheet" + key2 = "balanceSheetStatements" + else: + key1 = "incomeStatement" + key2 = "incomeStatementHistory" + key1 += "History" + if timescale == "quarterly": + key1 += "Quarterly" + data = data.get(key1)[key2] + + # Tabulate + df = pd.DataFrame(data) + if len(df) == 0: + return pd.DataFrame() + df = df.drop(columns=['maxAge']) + for col in df.columns: + df[col] = df[col].replace('-', np.nan) + df.set_index('endDate', inplace=True) + try: + df.index = pd.to_datetime(df.index, unit='s') + except ValueError: + df.index = pd.to_datetime(df.index) + df = df.T + df.columns.name = '' + df.index.name = 'Breakdown' + # rename incorrect yahoo key + df.rename(index={'treasuryStock': 'gainsLossesNotAffectingRetainedEarnings'}, inplace=True) + + # Upper-case first letter, leave rest unchanged: + s0 = df.index[0] + df.index = [s[0].upper()+s[1:] for s in df.index] + + return df diff --git a/yfinance/scrapers/holders.py b/yfinance/scrapers/holders.py new file mode 100644 index 000000000..76faad748 --- /dev/null +++ b/yfinance/scrapers/holders.py @@ -0,0 +1,66 @@ +import pandas as pd + +from yfinance.data import TickerData + +class Holders: + _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self._major = None + self._institutional = None + self._mutualfund = None + + @property + def major(self) -> pd.DataFrame: + if self._major is None: + self._scrape(self.proxy) + return self._major + + @property + def institutional(self) -> pd.DataFrame: + if self._institutional is None: + self._scrape(self.proxy) + return self._institutional + + @property + def mutualfund(self) -> pd.DataFrame: + if self._mutualfund is None: + self._scrape(self.proxy) + return self._mutualfund + + def _scrape(self, proxy): + ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) + try: + resp = self._data.cache_get(ticker_url + '/holders', proxy) + holders = pd.read_html(resp.text) + except Exception: + holders = [] + + if len(holders) >= 3: + self._major = holders[0] + self._institutional = holders[1] + self._mutualfund = holders[2] + elif len(holders) >= 2: + self._major = holders[0] + self._institutional = holders[1] + elif len(holders) >= 1: + self._major = holders[0] + + if self._institutional is not None: + if 'Date Reported' in self._institutional: + self._institutional['Date Reported'] = pd.to_datetime( + self._institutional['Date Reported']) + if '% Out' in self._institutional: + self._institutional['% 
Out'] = self._institutional[ + '% Out'].str.replace('%', '').astype(float) / 100 + + if self._mutualfund is not None: + if 'Date Reported' in self._mutualfund: + self._mutualfund['Date Reported'] = pd.to_datetime( + self._mutualfund['Date Reported']) + if '% Out' in self._mutualfund: + self._mutualfund['% Out'] = self._mutualfund[ + '% Out'].str.replace('%', '').astype(float) / 100 diff --git a/yfinance/scrapers/quote.py b/yfinance/scrapers/quote.py new file mode 100644 index 000000000..d14078435 --- /dev/null +++ b/yfinance/scrapers/quote.py @@ -0,0 +1,210 @@ +import datetime +import json + +import pandas as pd + +from yfinance import utils +from yfinance.data import TickerData + + +class Quote: + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self._info = None + self._sustainability = None + self._recommendations = None + self._calendar = None + + self._already_scraped = False + self._already_scraped_complementary = False + + @property + def info(self) -> dict: + if self._info is None: + self._scrape(self.proxy) + self._scrape_complementary(self.proxy) + + return self._info + + @property + def sustainability(self) -> pd.DataFrame: + if self._sustainability is None: + self._scrape(self.proxy) + return self._sustainability + + @property + def recommendations(self) -> pd.DataFrame: + if self._recommendations is None: + self._scrape(self.proxy) + return self._recommendations + + @property + def calendar(self) -> pd.DataFrame: + if self._calendar is None: + self._scrape(self.proxy) + return self._calendar + + def _scrape(self, proxy): + if self._already_scraped: + return + self._already_scraped = True + + # get info and sustainability + json_data = self._data.get_json_data_stores(proxy=proxy) + try: + quote_summary_store = json_data['QuoteSummaryStore'] + except KeyError: + err_msg = "No summary info found, symbol may be delisted" + print('- %s: %s' % (self._data.ticker, err_msg)) + return None + + # sustainability + d = {} + try: + if isinstance(quote_summary_store.get('esgScores'), dict): + for item in quote_summary_store['esgScores']: + if not isinstance(quote_summary_store['esgScores'][item], (dict, list)): + d[item] = quote_summary_store['esgScores'][item] + + s = pd.DataFrame(index=[0], data=d)[-1:].T + s.columns = ['Value'] + s.index.name = '%.f-%.f' % ( + s[s.index == 'ratingYear']['Value'].values[0], + s[s.index == 'ratingMonth']['Value'].values[0]) + + self._sustainability = s[~s.index.isin( + ['maxAge', 'ratingYear', 'ratingMonth'])] + except Exception: + pass + + self._info = {} + try: + items = ['summaryProfile', 'financialData', 'quoteType', + 'defaultKeyStatistics', 'assetProfile', 'summaryDetail'] + for item in items: + if isinstance(quote_summary_store.get(item), dict): + self._info.update(quote_summary_store[item]) + except Exception: + pass + + # For ETFs, provide this valuable data: the top holdings of the ETF + try: + if 'topHoldings' in quote_summary_store: + self._info.update(quote_summary_store['topHoldings']) + except Exception: + pass + + try: + if not isinstance(quote_summary_store.get('summaryDetail'), dict): + # For some reason summaryDetail did not give any results. 
The price dict + # usually has most of the same info + self._info.update(quote_summary_store.get('price', {})) + except Exception: + pass + + try: + # self._info['regularMarketPrice'] = self._info['regularMarketOpen'] + self._info['regularMarketPrice'] = quote_summary_store.get('price', {}).get( + 'regularMarketPrice', self._info.get('regularMarketOpen', None)) + except Exception: + pass + + try: + self._info['preMarketPrice'] = quote_summary_store.get('price', {}).get( + 'preMarketPrice', self._info.get('preMarketPrice', None)) + except Exception: + pass + + self._info['logo_url'] = "" + try: + if not 'website' in self._info: + self._info['logo_url'] = 'https://logo.clearbit.com/%s.com' % \ + self._info['shortName'].split(' ')[0].split(',')[0] + else: + domain = self._info['website'].split( + '://')[1].split('/')[0].replace('www.', '') + self._info['logo_url'] = 'https://logo.clearbit.com/%s' % domain + except Exception: + pass + + # events + try: + cal = pd.DataFrame(quote_summary_store['calendarEvents']['earnings']) + cal['earningsDate'] = pd.to_datetime( + cal['earningsDate'], unit='s') + self._calendar = cal.T + self._calendar.index = utils.camel2title(self._calendar.index) + self._calendar.columns = ['Value'] + except Exception as e: + pass + + # analyst recommendations + try: + rec = pd.DataFrame( + quote_summary_store['upgradeDowngradeHistory']['history']) + rec['earningsDate'] = pd.to_datetime( + rec['epochGradeDate'], unit='s') + rec.set_index('earningsDate', inplace=True) + rec.index.name = 'Date' + rec.columns = utils.camel2title(rec.columns) + self._recommendations = rec[[ + 'Firm', 'To Grade', 'From Grade', 'Action']].sort_index() + except Exception: + pass + + def _scrape_complementary(self, proxy): + if self._already_scraped_complementary: + return + self._already_scraped_complementary = True + + self._scrape(proxy) + if self._info is None: + return + + # Complementary key-statistics. For now just want 'trailing PEG ratio' + keys = {"trailingPegRatio"} + if keys: + # Simplified the original scrape code for key-statistics. 
Very expensive for fetching + # just one value, best if scraping most/all: + # + # p = _re.compile(r'root\.App\.main = (.*);') + # url = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'.format(self._ticker.ticker, self._ticker.ticker) + # try: + # r = session.get(url, headers=utils.user_agent_headers) + # data = _json.loads(p.findall(r.text)[0]) + # key_stats = data['context']['dispatcher']['stores']['QuoteTimeSeriesStore']["timeSeries"] + # for k in keys: + # if k not in key_stats or len(key_stats[k])==0: + # # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate + # v = None + # else: + # # Select most recent (last) raw value in list: + # v = key_stats[k][-1]["reportedValue"]["raw"] + # self._info[k] = v + # except Exception: + # raise + # pass + # + # For just one/few variable is faster to query directly: + url = "https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{}?symbol={}".format( + self._data.ticker, self._data.ticker) + for k in keys: + url += "&type=" + k + # Request 6 months of data + url += "&period1={}".format( + int((datetime.datetime.now() - datetime.timedelta(days=365 // 2)).timestamp())) + url += "&period2={}".format(int((datetime.datetime.now() + datetime.timedelta(days=1)).timestamp())) + + json_str = self._data.cache_get(url=url, proxy=proxy).text + json_data = json.loads(json_str) + key_stats = json_data["timeseries"]["result"][0] + if k not in key_stats: + # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate + v = None + else: + # Select most recent (last) raw value in list: + v = key_stats[k][-1]["reportedValue"]["raw"] + self._info[k] = v diff --git a/yfinance/ticker.py b/yfinance/ticker.py index 0c821ebf8..20371ca26 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -30,6 +30,9 @@ class Ticker(TickerBase): + def __init__(self, ticker, session=None): + super(Ticker, self).__init__(ticker, session=session) + self._expirations = {} def __repr__(self): return 'yfinance.Ticker object <%s>' % self.ticker @@ -99,39 +102,43 @@ def isin(self): return self.get_isin() @property - def major_holders(self): + def major_holders(self) -> _pd.DataFrame: return self.get_major_holders() @property - def institutional_holders(self): + def institutional_holders(self) -> _pd.DataFrame: return self.get_institutional_holders() @property - def mutualfund_holders(self): + def mutualfund_holders(self) -> _pd.DataFrame: return self.get_mutualfund_holders() @property - def dividends(self): + def dividends(self) -> _pd.Series: return self.get_dividends() @property - def splits(self): + def capital_gains(self): + return self.get_capital_gains() + + @property + def splits(self) -> _pd.Series: return self.get_splits() @property - def actions(self): + def actions(self) -> _pd.DataFrame: return self.get_actions() @property - def shares(self): + def shares(self) -> _pd.DataFrame : return self.get_shares() @property - def info(self): + def info(self) -> dict: return self.get_info() @property - def calendar(self): + def calendar(self) -> _pd.DataFrame: return self.get_calendar() @property @@ -139,63 +146,63 @@ def recommendations(self): return self.get_recommendations() @property - def earnings(self): + def earnings(self) -> _pd.DataFrame: return self.get_earnings() @property - def quarterly_earnings(self): + def quarterly_earnings(self) -> _pd.DataFrame: return self.get_earnings(freq='quarterly') @property - def income_stmt(self): - return self.get_income_stmt() + def income_stmt(self) -> _pd.DataFrame: + 
return self.get_income_stmt(pretty=True) @property - def quarterly_income_stmt(self): - return self.get_income_stmt(freq='quarterly') + def quarterly_income_stmt(self) -> _pd.DataFrame: + return self.get_income_stmt(pretty=True, freq='quarterly') @property - def balance_sheet(self): - return self.get_balance_sheet() + def balance_sheet(self) -> _pd.DataFrame: + return self.get_balance_sheet(pretty=True) @property - def quarterly_balance_sheet(self): - return self.get_balance_sheet(freq='quarterly') + def quarterly_balance_sheet(self) -> _pd.DataFrame: + return self.get_balance_sheet(pretty=True, freq='quarterly') @property - def balancesheet(self): + def balancesheet(self) -> _pd.DataFrame: return self.balance_sheet @property - def quarterly_balancesheet(self): + def quarterly_balancesheet(self) -> _pd.DataFrame: return self.quarterly_balance_sheet @property - def cashflow(self): - return self.get_cashflow(freq="yearly") + def cashflow(self) -> _pd.DataFrame: + return self.get_cashflow(pretty=True, freq="yearly") @property - def quarterly_cashflow(self): - return self.get_cashflow(freq='quarterly') + def quarterly_cashflow(self) -> _pd.DataFrame: + return self.get_cashflow(pretty=True, freq='quarterly') @property def recommendations_summary(self): return self.get_recommendations_summary() @property - def analyst_price_target(self): + def analyst_price_target(self) -> _pd.DataFrame: return self.get_analyst_price_target() @property - def revenue_forecasts(self): + def revenue_forecasts(self) -> _pd.DataFrame: return self.get_rev_forecast() @property - def sustainability(self): + def sustainability(self) -> _pd.DataFrame: return self.get_sustainability() @property - def options(self): + def options(self) -> tuple: if not self._expirations: self._download_options() return tuple(self._expirations.keys()) @@ -205,17 +212,17 @@ def news(self): return self.get_news() @property - def earnings_trend(self): + def earnings_trend(self) -> _pd.DataFrame: return self.get_earnings_trend() @property - def earnings_history(self): - return self.get_earnings_history() - - @property - def earnings_dates(self): + def earnings_dates(self) -> _pd.DataFrame: return self.get_earnings_dates() @property - def earnings_forecasts(self): + def earnings_forecasts(self) -> _pd.DataFrame: return self.get_earnings_forecast() + + @property + def history_metadata(self) -> dict: + return self.get_history_metadata() diff --git a/yfinance/utils.py b/yfinance/utils.py index 689238f6b..8ea821f52 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -22,7 +22,8 @@ from __future__ import print_function import datetime as _datetime -from typing import Dict, Union +import dateutil as _dateutil +from typing import Dict, Union, List, Optional import pytz as _tz import requests as _requests @@ -216,7 +217,7 @@ def format_annual_financial_statement(level_detail, annual_dicts, annual_order, else: _statement = Annual - _statement.index = camel2title(_statement.T) + _statement.index = camel2title(_statement.T.index) _statement['level_detail'] = level_detail _statement = _statement.set_index([_statement.index, 'level_detail']) _statement = _statement[sorted(_statement.columns, reverse=True)] @@ -241,8 +242,46 @@ def format_quarterly_financial_statement(_statement, level_detail, order): return _statement -def camel2title(o): - return [_re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", i).title() for i in o] +def camel2title(strings: List[str], sep: str = ' ', acronyms: Optional[List[str]] = None) -> List[str]: + if isinstance(strings, str) or not 
diff --git a/yfinance/utils.py b/yfinance/utils.py
index 689238f6b..8ea821f52 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -22,7 +22,8 @@
 from __future__ import print_function
 
 import datetime as _datetime
-from typing import Dict, Union
+import dateutil as _dateutil
+from typing import Dict, Union, List, Optional
 
 import pytz as _tz
 import requests as _requests
@@ -216,7 +217,7 @@ def format_annual_financial_statement(level_detail, annual_dicts, annual_order,
     else:
         _statement = Annual
 
-    _statement.index = camel2title(_statement.T)
+    _statement.index = camel2title(_statement.T.index)
     _statement['level_detail'] = level_detail
     _statement = _statement.set_index([_statement.index, 'level_detail'])
     _statement = _statement[sorted(_statement.columns, reverse=True)]
@@ -241,8 +242,46 @@ def format_quarterly_financial_statement(_statement, level_detail, order):
     return _statement
 
 
-def camel2title(o):
-    return [_re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", i).title() for i in o]
+def camel2title(strings: List[str], sep: str = ' ', acronyms: Optional[List[str]] = None) -> List[str]:
+    if isinstance(strings, str) or not hasattr(strings, '__iter__') or any(not isinstance(s, str) for s in strings):
+        raise TypeError("camel2title() 'strings' argument must be an iterable of strings")
+    if not isinstance(sep, str) or len(sep) != 1:
+        raise ValueError(f"camel2title() 'sep' argument = '{sep}' must be a single character")
+    if _re.match("[a-zA-Z0-9]", sep):
+        raise ValueError(f"camel2title() 'sep' argument = '{sep}' cannot be alpha-numeric")
+    if _re.escape(sep) != sep and sep not in {' ', '-'}:
+        # Permit space and hyphen, which re.escape() escapes despite being safe as literal separators
+        raise ValueError(f"camel2title() 'sep' argument = '{sep}' cannot be a special character")
+
+    if acronyms is None:
+        pat = "([a-z])([A-Z])"
+        rep = rf"\g<1>{sep}\g<2>"
+        return [_re.sub(pat, rep, s).title() for s in strings]
+
+    # Handling acronyms requires more care. Assumes Yahoo returns acronym strings upper-case
+    if isinstance(acronyms, str) or not hasattr(acronyms, '__iter__') or any(not isinstance(a, str) for a in acronyms):
+        raise TypeError("camel2title() 'acronyms' argument must be an iterable of strings")
+    for a in acronyms:
+        if not _re.match("^[A-Z]+$", a):
+            raise ValueError(f"camel2title() 'acronyms' argument must only contain upper-case strings, but '{a}' detected")
+
+    # Insert 'sep' between lower-then-upper-case
+    pat = "([a-z])([A-Z])"
+    rep = rf"\g<1>{sep}\g<2>"
+    strings = [_re.sub(pat, rep, s) for s in strings]
+
+    # Insert 'sep' after acronyms
+    for a in acronyms:
+        pat = f"({a})([A-Z][a-z])"
+        rep = rf"\g<1>{sep}\g<2>"
+        strings = [_re.sub(pat, rep, s) for s in strings]
+
+    # Apply str.title() to non-acronym words
+    strings = [s.split(sep) for s in strings]
+    strings = [[j.title() if j not in acronyms else j for j in s] for s in strings]
+    strings = [sep.join(s) for s in strings]
+
+    return strings
 
 
 def _parse_user_dt(dt, exchange_tz):
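To make the new helper's behaviour concrete, here is how the implementation above handles acronyms and separators. The expected outputs are worked out from the regex logic; `EPS` is an illustrative acronym:

```python
from yfinance.utils import camel2title

# Without acronyms, str.title() lower-cases everything after the first letter:
camel2title(["totalRevenue", "basicEPS"])
# -> ['Total Revenue', 'Basic Eps']

# Declaring the acronym preserves its casing while still inserting the separator:
camel2title(["totalRevenue", "basicEPS"], acronyms=["EPS"])
# -> ['Total Revenue', 'Basic EPS']

# A custom single-character separator is validated, then used throughout:
camel2title(["netIncome"], sep='-')
# -> ['Net-Income']
```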
@@ -262,7 +301,17 @@ def _parse_user_dt(dt, exchange_tz):
     return dt
 
 
+def _interval_to_timedelta(interval):
+    if interval == "1mo":
+        return _dateutil.relativedelta.relativedelta(months=1)
+    elif interval == "1wk":
+        return _pd.Timedelta(days=7)
+    else:
+        return _pd.Timedelta(interval)
+
+
 def auto_adjust(data):
+    col_order = data.columns
     df = data.copy()
     ratio = df["Close"] / df["Adj Close"]
     df["Adj Open"] = df["Open"] / ratio
@@ -278,13 +327,13 @@ def auto_adjust(data):
         "Adj Low": "Low",
         "Adj Close": "Close"
     }, inplace=True)
 
-    df = df[["Open", "High", "Low", "Close", "Volume"]]
-    return df[["Open", "High", "Low", "Close", "Volume"]]
+    return df[[c for c in col_order if c in df.columns]]
 
 
 def back_adjust(data):
     """ back-adjusted data to mimic true historical prices """
+    col_order = data.columns
     df = data.copy()
     ratio = df["Adj Close"] / df["Close"]
     df["Adj Open"] = df["Open"] * ratio
@@ -300,7 +349,7 @@ def back_adjust(data):
         "Adj Low": "Low"
     }, inplace=True)
 
-    return df[["Open", "High", "Low", "Close", "Volume"]]
+    return df[[c for c in col_order if c in df.columns]]
 
 
 def parse_quotes(data):
@@ -332,6 +381,8 @@ def parse_quotes(data):
 def parse_actions(data):
     dividends = _pd.DataFrame(
         columns=["Dividends"], index=_pd.DatetimeIndex([]))
+    capital_gains = _pd.DataFrame(
+        columns=["Capital Gains"], index=_pd.DatetimeIndex([]))
     splits = _pd.DataFrame(
         columns=["Stock Splits"], index=_pd.DatetimeIndex([]))
 
@@ -342,9 +393,16 @@ def parse_actions(data):
         dividends.set_index("date", inplace=True)
         dividends.index = _pd.to_datetime(dividends.index, unit="s")
         dividends.sort_index(inplace=True)
-
         dividends.columns = ["Dividends"]
 
+        if "capitalGains" in data["events"]:
+            capital_gains = _pd.DataFrame(
+                data=list(data["events"]["capitalGains"].values()))
+            capital_gains.set_index("date", inplace=True)
+            capital_gains.index = _pd.to_datetime(capital_gains.index, unit="s")
+            capital_gains.sort_index(inplace=True)
+            capital_gains.columns = ["Capital Gains"]
+
         if "splits" in data["events"]:
             splits = _pd.DataFrame(
                 data=list(data["events"]["splits"].values()))
@@ -355,7 +413,7 @@
             splits["denominator"]
         splits = splits[["Stock Splits"]]
 
-    return dividends, splits
+    return dividends, splits, capital_gains
 
 
 def set_df_tz(df, interval, tz):
@@ -393,7 +451,7 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange):
     elif interval == "3mo":
         last_rows_same_interval = dt1.year == dt2.year and dt1.quarter == dt2.quarter
     else:
-        last_rows_same_interval = False
+        last_rows_same_interval = (dt1 - dt2) < _interval_to_timedelta(interval)
 
     if last_rows_same_interval:
         # Last two rows are within same interval
@@ -472,7 +530,7 @@ def _reindex_events(df, new_index, data_col_name):
             new_index = None
 
         if new_index is not None:
-            new_index = new_index.tz_localize(df.index.tz, ambiguous=True)
+            new_index = new_index.tz_localize(df.index.tz, ambiguous=True, nonexistent='shift_forward')
 
         df_sub = _reindex_events(df_sub, new_index, data_col)
         df = df_main.join(df_sub)
@@ -542,13 +600,15 @@ def _reindex_events(df, new_index, data_col_name):
             ## Not always possible to match events with trading, e.g. when released pre-market.
             ## So have to append to bottom with nan prices.
             ## But should only be impossible with intra-day price data.
-            if interval.endswith('m') or interval.endswith('h'):
+            if interval.endswith('m') or interval.endswith('h') or interval == "1d":
+                # Update: also possible with daily data when a dividend is very recent
                 f_missing = ~df_sub.index.isin(df.index)
                 df_sub_missing = df_sub[f_missing]
                 keys = {"Adj Open", "Open", "Adj High", "High",
                         "Adj Low", "Low", "Adj Close", "Close"}.intersection(df.columns)
                 df_sub_missing[list(keys)] = _np.nan
-                df = _pd.concat([df, df_sub_missing], sort=True)
+                col_ordering = df.columns
+                df = _pd.concat([df, df_sub_missing], sort=True)[col_ordering]
             else:
                 raise Exception("Lost data during merge despite all attempts to align data (see above)")
 
@@ -721,12 +781,17 @@ def tz_db(self):
 
     def _migrate_cache_tkr_tz(self):
         """Migrate contents from old ticker CSV-cache to SQLite db"""
-        fp = _os.path.join(self._db_dir, "tkr-tz.csv")
-        if not _os.path.isfile(fp):
+        old_cache_file_path = _os.path.join(self._db_dir, "tkr-tz.csv")
+
+        if not _os.path.isfile(old_cache_file_path):
             return None
-        df = _pd.read_csv(fp, index_col="Ticker")
-        self.tz_db.bulk_set(df.to_dict()['Tz'])
-        _os.remove(fp)
+        try:
+            df = _pd.read_csv(old_cache_file_path, index_col="Ticker")
+        except _pd.errors.EmptyDataError:
+            _os.remove(old_cache_file_path)
+        else:
+            self.tz_db.bulk_set(df.to_dict()['Tz'])
+            _os.remove(old_cache_file_path)
 
 
 class _TzCacheDummy:
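One endnote on the cache migration: the try/except/else above exists so that an empty legacy CSV is cleaned up instead of crashing the migration. A standalone repro of that path, using hypothetical temp paths rather than the library's real cache location:

```python
import os
import tempfile
import pandas as pd

db_dir = tempfile.mkdtemp()
old_cache_file_path = os.path.join(db_dir, "tkr-tz.csv")
open(old_cache_file_path, "w").close()   # simulate an empty legacy cache file

try:
    df = pd.read_csv(old_cache_file_path, index_col="Ticker")
except pd.errors.EmptyDataError:
    os.remove(old_cache_file_path)        # nothing to migrate; just delete
else:
    tz_by_ticker = df.to_dict()["Tz"]     # would be bulk-loaded into the SQLite db
    os.remove(old_cache_file_path)

print(os.path.isfile(old_cache_file_path))  # False: old cache removed either way
```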