From 1ed4b4b65de0a182f65761d242b6f4154c47cbdf Mon Sep 17 00:00:00 2001 From: Thirumalai Raj R Date: Fri, 11 Nov 2022 20:57:39 +0530 Subject: [PATCH 01/41] For ETFs & Mutual Funds, add capitalGains --- README.md | 6 +++++- yfinance/base.py | 30 +++++++++++++++++++++++++----- yfinance/ticker.py | 4 ++++ yfinance/utils.py | 12 +++++++++++- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f3ec7c153..a62990230 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ msft.info # get historical market data hist = msft.history(period="max") -# show actions (dividends, splits) +# show actions (dividends, splits, capital gains) msft.actions # show dividends @@ -68,6 +68,9 @@ msft.dividends # show splits msft.splits +# show capital gains (for mutual funds & etfs) +msft.capital_gains + # show financials msft.financials msft.quarterly_financials @@ -128,6 +131,7 @@ msft.history(..., proxy="PROXY_SERVER") msft.get_actions(proxy="PROXY_SERVER") msft.get_dividends(proxy="PROXY_SERVER") msft.get_splits(proxy="PROXY_SERVER") +msft.get_capital_gains(proxy="PROXY_SERVER") msft.get_balance_sheet(proxy="PROXY_SERVER") msft.get_cashflow(proxy="PROXY_SERVER") msft.option_chain(..., proxy="PROXY_SERVER") diff --git a/yfinance/base.py b/yfinance/base.py index 682131904..cc1393593 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -190,7 +190,7 @@ def history(self, period="1mo", interval="1d", params["interval"] = interval.lower() params["includePrePost"] = prepost - params["events"] = "div,splits" + params["events"] = "div,splits,capitalGains" # 1) fix weired bug with Yahoo! 
- returning 60m for 30m bars if params["interval"] == "30m": @@ -324,23 +324,29 @@ def history(self, period="1mo", interval="1d", quotes['Volume'] = quotes['Volume'].fillna(0).astype(_np.int64) # actions - dividends, splits = utils.parse_actions(data["chart"]["result"][0]) + dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0]) if start is not None: startDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(start)) if dividends is not None: dividends = dividends[dividends.index>=startDt] + if capital_gains is not None: + capital_gains = capital_gains[capital_gains.index>=startDt] if splits is not None: splits = splits[splits.index>=startDt] if end is not None: endDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(end)) if dividends is not None: dividends = dividends[dividends.index 0: + df = utils.safe_merge_dfs(df, capital_gains, interval) + if "Capital Gains" in df.columns: + df.loc[df["Capital Gains"].isna(),"Capital Gains"] = 0 + else: + df["Capital Gains"] = 0.0 if params["interval"][-1] in ("m",'h'): df.index.name = "Datetime" @@ -366,7 +378,7 @@ def history(self, period="1mo", interval="1d", df = df[~df.index.duplicated(keep='first')] self._history = df.copy() if not actions: - df = df.drop(columns=["Dividends", "Stock Splits"]) + df = df.drop(columns=["Dividends", "Stock Splits", "Capital Gains"]) if not keepna: mask_nan_or_zero = (df.isna()|(df==0)).all(axis=1) df = df.drop(mask_nan_or_zero.index[mask_nan_or_zero]) @@ -1008,6 +1020,14 @@ def get_dividends(self, proxy=None): return dividends[dividends != 0] return [] + def get_capital_gains(self, proxy=None): + if self._history is None: + self.history(period="max", proxy=proxy) + if self._history is not None and "Capital Gains" in self._history: + capital_gains = self._history["Capital Gains"] + return capital_gains[capital_gains != 0] + return [] + def get_splits(self, proxy=None): if self._history is None: self.history(period="max", proxy=proxy) @@ -1019,8 +1039,8 @@ 
def get_splits(self, proxy=None): def get_actions(self, proxy=None): if self._history is None: self.history(period="max", proxy=proxy) - if self._history is not None and "Dividends" in self._history and "Stock Splits" in self._history: - actions = self._history[["Dividends", "Stock Splits"]] + if self._history is not None and "Dividends" in self._history and "Stock Splits" in self._history and "Capital Gains" in self._history: + actions = self._history[["Dividends", "Stock Splits", "Capital Gains"]] return actions[actions != 0].dropna(how='all').fillna(0) return [] diff --git a/yfinance/ticker.py b/yfinance/ticker.py index cdd3d1471..7f41fbb8c 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -130,6 +130,10 @@ def mutualfund_holders(self): def dividends(self): return self.get_dividends() + @property + def capital_gains(self): + return self.get_capital_gains() + @property def splits(self): return self.get_splits() diff --git a/yfinance/utils.py b/yfinance/utils.py index 826ae6540..2597c8bc7 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -236,6 +236,8 @@ def parse_quotes(data): def parse_actions(data): dividends = _pd.DataFrame( columns=["Dividends"], index=_pd.DatetimeIndex([])) + capital_gains = _pd.DataFrame( + columns=["Capital Gains"], index=_pd.DatetimeIndex([])) splits = _pd.DataFrame( columns=["Stock Splits"], index=_pd.DatetimeIndex([])) @@ -249,6 +251,14 @@ def parse_actions(data): dividends.columns = ["Dividends"] + if "capitalGains" in data["events"]: + capital_gains = _pd.DataFrame( + data=list(data["events"]["capitalGains"].values())) + capital_gains.set_index("date", inplace=True) + capital_gains.index = _pd.to_datetime(capital_gains.index, unit="s") + capital_gains.sort_index(inplace=True) + capital_gains.columns = ["Capital Gains"] + if "splits" in data["events"]: splits = _pd.DataFrame( data=list(data["events"]["splits"].values())) @@ -259,7 +269,7 @@ def parse_actions(data): splits["denominator"] splits = splits[["Stock 
Splits"]] - return dividends, splits + return dividends, splits, capital_gains def set_df_tz(df, interval, tz): From 9ba3d5a1ea5455567766ed3592283b1373fee11d Mon Sep 17 00:00:00 2001 From: ppeloton Date: Sun, 13 Nov 2022 08:44:43 +0200 Subject: [PATCH 02/41] Fixing issue 980 by changing default timestamp for start parameter in base.py --- yfinance/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yfinance/base.py b/yfinance/base.py index 3a525edc3..6b03b5508 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -164,7 +164,8 @@ def history(self, period="1mo", interval="1d", if interval == "1m": start = end - 604800 # Subtract 7 days else: - start = -631159200 + #time stamp of 01/01/1900 + start = -2208994789 else: start = utils._parse_user_dt(start, tz) params = {"period1": start, "period2": end} From 724118a671ad8a083ecaeb0bb7ed937fa87f0dca Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Tue, 8 Nov 2022 17:39:48 +0100 Subject: [PATCH 03/41] Major refactoring Made fundamentals, quote, analysis and holders into lazy scraped modules in order to improve performance and code maintainability. 
--- tests/prices.py | 27 +- tests/ticker.py | 2 +- yfinance/base.py | 515 ++++-------------------------- yfinance/data.py | 6 +- yfinance/scrapers/__init__.py | 0 yfinance/scrapers/analysis.py | 121 +++++++ yfinance/scrapers/fundamentals.py | 150 +++++++++ yfinance/scrapers/holders.py | 66 ++++ yfinance/scrapers/quote.py | 214 +++++++++++++ yfinance/ticker.py | 3 + 10 files changed, 635 insertions(+), 469 deletions(-) create mode 100644 yfinance/scrapers/__init__.py create mode 100644 yfinance/scrapers/analysis.py create mode 100644 yfinance/scrapers/fundamentals.py create mode 100644 yfinance/scrapers/holders.py create mode 100644 yfinance/scrapers/quote.py diff --git a/tests/prices.py b/tests/prices.py index a0601a652..d5ef0bcbc 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -6,22 +6,22 @@ import pytz as _tz import numpy as _np import pandas as _pd -import os -# Create temp session -import requests_cache, tempfile +import requests_cache -td = tempfile.TemporaryDirectory() class TestPriceHistory(unittest.TestCase): - def setUp(self): - global td - self.td = td - self.session = requests_cache.CachedSession(os.path.join(self.td.name, "yfinance.cache")) + session = None - def tearDown(self): - self.session.close() + @classmethod + def setUpClass(cls): + cls.session = requests_cache.CachedSession() + + @classmethod + def tearDownClass(cls): + if cls.session is not None: + cls.session.close() def test_daily_index(self): tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"] @@ -433,11 +433,8 @@ def test_repair_daily_zeroes(self): for c in ["Open", "Low", "High", "Close"]: self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-8).all()) -try: - if __name__ == '__main__': - unittest.main() -finally: - td.cleanup() +if __name__ == '__main__': + unittest.main() # # Run tests sequentially: # import inspect diff --git a/tests/ticker.py b/tests/ticker.py index 9f9a6207a..5a67a2a39 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -16,7 +16,7 @@ import 
requests_cache # Set this to see the exact requests that are made during tests -DEBUG_LOG_REQUESTS = True +DEBUG_LOG_REQUESTS = False if DEBUG_LOG_REQUESTS: import logging diff --git a/yfinance/base.py b/yfinance/base.py index 6b03b5508..7a4e1ced5 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -32,9 +32,11 @@ from . import utils -import json as _json - from . import shared +from .scrapers.analysis import Analysis +from .scrapers.fundamentals import Fundamentals +from .scrapers.holders import Holders +from .scrapers.quote import Quote _BASE_URL_ = 'https://query2.finance.yahoo.com' _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' @@ -50,25 +52,10 @@ def __init__(self, ticker, session=None): self._scrape_url = _SCRAPE_URL_ self._tz = None - self._fundamentals = False - self._info = None - self._earnings_trend = None - self._sustainability = None - self._recommendations = None - self._analyst_trend_details = None - self._analyst_price_target = None - self._rev_est = None - self._eps_est = None - - self._major_holders = None - self._institutional_holders = None - self._mutualfund_holders = None self._isin = None self._news = [] self._shares = None - self._calendar = None - self._expirations = {} self._earnings_dates = None self._earnings_history = None @@ -79,13 +66,14 @@ def __init__(self, ticker, session=None): if utils.is_isin(self.ticker): self.ticker = utils.get_ticker_by_isin(self.ticker, None, session) - self._data = TickerData(self.ticker, session=session) - - def stats(self, proxy=None): + self._data: TickerData = TickerData(self.ticker, session=session) - if self._fundamentals: - return + self._analysis = Analysis(self._data) + self._holders = Holders(self._data) + self._quote = Quote(self._data) + self._fundamentals = Fundamentals(self._data) + def stats(self, proxy=None): ticker_url = "{}/{}".format(self._scrape_url, self.ticker) # get info and sustainability @@ -379,7 +367,7 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): 
data_cols = [c for c in ["Open","High","Low","Close","Adj Close"] if c in df_row.index] - # If interval is weekly then can construct with daily. But if smaller intervals then + # If interval is weekly then can construct with daily. But if smaller intervals then # restricted to recent times: # - daily = hourly restricted to last 730 days sub_interval = None @@ -642,477 +630,102 @@ def _fetch_ticker_tz(self, debug_mode, proxy, timeout): print("-------------") return None - def _get_info(self, proxy=None): - if (self._info is not None) or (self._sustainability is not None) or self._recommendations: - # No need to fetch - return - - ticker_url = "{}/{}".format(self._scrape_url, self.ticker) - - # get info and sustainability - json_data = self._data.get_json_data_stores(ticker_url, proxy) - if 'QuoteSummaryStore' not in json_data: - err_msg = "No summary info found, symbol may be delisted" - print('- %s: %s' % (self.ticker, err_msg)) - return None - data = json_data['QuoteSummaryStore'] - - # sustainability - d = {} - try: - if isinstance(data.get('esgScores'), dict): - for item in data['esgScores']: - if not isinstance(data['esgScores'][item], (dict, list)): - d[item] = data['esgScores'][item] - - s = _pd.DataFrame(index=[0], data=d)[-1:].T - s.columns = ['Value'] - s.index.name = '%.f-%.f' % ( - s[s.index == 'ratingYear']['Value'].values[0], - s[s.index == 'ratingMonth']['Value'].values[0]) - - self._sustainability = s[~s.index.isin( - ['maxAge', 'ratingYear', 'ratingMonth'])] - except Exception: - pass - - # info (be nice to python 2) - self._info = {} - try: - items = ['summaryProfile', 'financialData', 'quoteType', - 'defaultKeyStatistics', 'assetProfile', 'summaryDetail'] - for item in items: - if isinstance(data.get(item), dict): - self._info.update(data[item]) - except Exception: - pass - - # For ETFs, provide this valuable data: the top holdings of the ETF - try: - if 'topHoldings' in data: - self._info.update(data['topHoldings']) - except Exception: - pass - - 
try: - if not isinstance(data.get('summaryDetail'), dict): - # For some reason summaryDetail did not give any results. The price dict - # usually has most of the same info - self._info.update(data.get('price', {})) - except Exception: - pass - - try: - # self._info['regularMarketPrice'] = self._info['regularMarketOpen'] - self._info['regularMarketPrice'] = data.get('price', {}).get( - 'regularMarketPrice', self._info.get('regularMarketOpen', None)) - except Exception: - pass - - try: - self._info['preMarketPrice'] = data.get('price', {}).get( - 'preMarketPrice', self._info.get('preMarketPrice', None)) - except Exception: - pass - - self._info['logo_url'] = "" - try: - if not 'website' in self._info: - self._info['logo_url'] = 'https://logo.clearbit.com/%s.com' % \ - self._info['shortName'].split(' ')[0].split(',')[0] - else: - domain = self._info['website'].split( - '://')[1].split('/')[0].replace('www.', '') - self._info['logo_url'] = 'https://logo.clearbit.com/%s' % domain - except Exception: - pass - - # events - try: - cal = _pd.DataFrame( - data['calendarEvents']['earnings']) - cal['earningsDate'] = _pd.to_datetime( - cal['earningsDate'], unit='s') - self._calendar = cal.T - self._calendar.index = utils.camel2title(self._calendar.index) - self._calendar.columns = ['Value'] - except Exception: - pass - - # analyst recommendations - try: - rec = _pd.DataFrame( - data['upgradeDowngradeHistory']['history']) - rec['earningsDate'] = _pd.to_datetime( - rec['epochGradeDate'], unit='s') - rec.set_index('earningsDate', inplace=True) - rec.index.name = 'Date' - rec.columns = utils.camel2title(rec.columns) - self._recommendations = rec[[ - 'Firm', 'To Grade', 'From Grade', 'Action']].sort_index() - except Exception: - pass - - # Complementary key-statistics. For now just want 'trailing PEG ratio' - keys = {"trailingPegRatio"} - if len(keys) > 0: - # Simplified the original scrape code for key-statistics. 
Very expensive for fetching - # just one value, best if scraping most/all: - # - # p = _re.compile(r'root\.App\.main = (.*);') - # url = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'.format(self.ticker, self.ticker) - # try: - # r = session.get(url, headers=utils.user_agent_headers) - # data = _json.loads(p.findall(r.text)[0]) - # key_stats = data['context']['dispatcher']['stores']['QuoteTimeSeriesStore']["timeSeries"] - # for k in keys: - # if k not in key_stats or len(key_stats[k])==0: - # # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate - # v = None - # else: - # # Select most recent (last) raw value in list: - # v = key_stats[k][-1]["reportedValue"]["raw"] - # self._info[k] = v - # except Exception: - # raise - # pass - # - # For just one/few variable is faster to query directly: - url = "https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{}?symbol={}".format( - self.ticker, self.ticker) - for k in keys: - url += "&type=" + k - # Request 6 months of data - url += "&period1={}".format( - int((_datetime.datetime.now() - _datetime.timedelta(days=365 // 2)).timestamp())) - url += "&period2={}".format(int((_datetime.datetime.now() + _datetime.timedelta(days=1)).timestamp())) - - json_str = self._data.get(url=url, proxy=proxy).text - json_data = _json.loads(json_str) - key_stats = json_data["timeseries"]["result"][0] - if k not in key_stats: - # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate - v = None - else: - # Select most recent (last) raw value in list: - v = key_stats[k][-1]["reportedValue"]["raw"] - self._info[k] = v - - def _get_fundamentals(self, proxy=None): - def cleanup(data): - ''' - The cleanup function is used for parsing yahoo finance json financial statement data into a pandas dataframe format. 
- ''' - df = _pd.DataFrame(data).drop(columns=['maxAge']) - for col in df.columns: - df[col] = _np.where( - df[col].astype(str) == '-', _np.nan, df[col]) - - df.set_index('endDate', inplace=True) - try: - df.index = _pd.to_datetime(df.index, unit='s') - except ValueError: - df.index = _pd.to_datetime(df.index) - df = df.T - df.columns.name = '' - df.index.name = 'Breakdown' - - # rename incorrect yahoo key - df.rename(index={'treasuryStock': 'Gains Losses Not Affecting Retained Earnings'}, inplace=True) - - df.index = utils.camel2title(df.index) - return df - - if self._fundamentals: - return - - ticker_url = "{}/{}".format(self._scrape_url, self.ticker) - - # holders - try: - resp = self._data.get(ticker_url + '/holders', proxy) - holders = _pd.read_html(resp.text) - except Exception: - holders = [] - - if len(holders) >= 3: - self._major_holders = holders[0] - self._institutional_holders = holders[1] - self._mutualfund_holders = holders[2] - elif len(holders) >= 2: - self._major_holders = holders[0] - self._institutional_holders = holders[1] - elif len(holders) >= 1: - self._major_holders = holders[0] - - if self._institutional_holders is not None: - if 'Date Reported' in self._institutional_holders: - self._institutional_holders['Date Reported'] = _pd.to_datetime( - self._institutional_holders['Date Reported']) - if '% Out' in self._institutional_holders: - self._institutional_holders['% Out'] = self._institutional_holders[ - '% Out'].str.replace('%', '').astype(float) / 100 - - if self._mutualfund_holders is not None: - if 'Date Reported' in self._mutualfund_holders: - self._mutualfund_holders['Date Reported'] = _pd.to_datetime( - self._mutualfund_holders['Date Reported']) - if '% Out' in self._mutualfund_holders: - self._mutualfund_holders['% Out'] = self._mutualfund_holders[ - '% Out'].str.replace('%', '').astype(float) / 100 - - self._get_info(proxy) - - # get fundamentals - self._earnings = {"yearly": utils._pd.DataFrame(), "quarterly": 
utils._pd.DataFrame()} - self._financials = {} - for name in ["income", "balance-sheet", "cash-flow"]: - self._financials[name] = {"yearly": utils._pd.DataFrame(), "quarterly": utils._pd.DataFrame()} - - financials_data = self._data.get_json_data_stores(ticker_url + '/financials', proxy) - if not "QuoteSummaryStore" in financials_data: - err_msg = "No financials data found, symbol may be delisted" - print('- %s: %s' % (self.ticker, err_msg)) - return None - fin_data_quote = financials_data['QuoteSummaryStore'] - - # generic patterns - for name in ["income", "balance-sheet", "cash-flow"]: - annual, qtr = self._create_financials_table(name, proxy) - if annual is not None: - self._financials[name]["yearly"] = annual - if qtr is not None: - self._financials[name]["quarterly"] = qtr - - # earnings - if isinstance(fin_data_quote.get('earnings'), dict): - try: - earnings = fin_data_quote['earnings']['financialsChart'] - earnings['financialCurrency'] = fin_data_quote['earnings'].get('financialCurrency', 'USD') - self._earnings['financialCurrency'] = earnings['financialCurrency'] - df = _pd.DataFrame(earnings['yearly']).set_index('date') - df.columns = utils.camel2title(df.columns) - df.index.name = 'Year' - self._earnings['yearly'] = df - - df = _pd.DataFrame(earnings['quarterly']).set_index('date') - df.columns = utils.camel2title(df.columns) - df.index.name = 'Quarter' - self._earnings['quarterly'] = df - except Exception: - pass - - # shares outstanding - try: - # keep only years with non None data - available_shares = [shares_data for shares_data in - financials_data['QuoteTimeSeriesStore']['timeSeries']['annualBasicAverageShares'] if - shares_data] - shares = _pd.DataFrame(available_shares) - shares['Year'] = shares['asOfDate'].agg(lambda x: int(x[:4])) - shares.set_index('Year', inplace=True) - shares.drop(columns=['dataId', 'asOfDate', - 'periodType', 'currencyCode'], inplace=True) - shares.rename( - columns={'reportedValue': "BasicShares"}, inplace=True) - 
self._shares = shares - except Exception: - pass - - # Analysis - data = self._data.get_json_data_stores(ticker_url + '/analysis', proxy)["QuoteSummaryStore"] - - if isinstance(data.get('earningsTrend'), dict): - try: - analysis = _pd.DataFrame(data['earningsTrend']['trend']) - analysis['endDate'] = _pd.to_datetime(analysis['endDate']) - analysis.set_index('period', inplace=True) - analysis.index = analysis.index.str.upper() - analysis.index.name = 'Period' - analysis.columns = utils.camel2title(analysis.columns) - - dict_cols = [] - - for idx, row in analysis.iterrows(): - for colname, colval in row.items(): - if isinstance(colval, dict): - dict_cols.append(colname) - for k, v in colval.items(): - new_colname = colname + ' ' + \ - utils.camel2title([k])[0] - analysis.loc[idx, new_colname] = v - - self._earnings_trend = analysis[[ - c for c in analysis.columns if c not in dict_cols]] - except Exception: - pass - - # Analysis Data/Analyst Forecasts - try: - analysis_data = self._data.get_json_data_stores(ticker_url + '/analysis', proxy) - analysis_data = analysis_data['QuoteSummaryStore'] - except Exception as e: - analysis_data = {} - try: - self._analyst_trend_details = _pd.DataFrame(analysis_data['recommendationTrend']['trend']) - except Exception as e: - self._analyst_trend_details = None - try: - self._analyst_price_target = _pd.DataFrame(analysis_data['financialData'], index=[0])[ - ['targetLowPrice', 'currentPrice', 'targetMeanPrice', 'targetHighPrice', 'numberOfAnalystOpinions']].T - except Exception as e: - self._analyst_price_target = None - earnings_estimate = [] - revenue_estimate = [] - if len(self._analyst_trend_details) != 0: - for key in analysis_data['earningsTrend']['trend']: - try: - earnings_dict = key['earningsEstimate'] - earnings_dict['period'] = key['period'] - earnings_dict['endDate'] = key['endDate'] - earnings_estimate.append(earnings_dict) - - revenue_dict = key['revenueEstimate'] - revenue_dict['period'] = key['period'] - 
revenue_dict['endDate'] = key['endDate'] - revenue_estimate.append(revenue_dict) - except Exception as e: - pass - self._rev_est = _pd.DataFrame(revenue_estimate) - self._eps_est = _pd.DataFrame(earnings_estimate) - else: - self._rev_est = _pd.DataFrame() - self._eps_est = _pd.DataFrame() - - self._fundamentals = True - - def _create_financials_table(self, name, proxy): - acceptable_names = ["income", "balance-sheet", "cash-flow"] - if not name in acceptable_names: - raise Exception("name '{}' must be one of: {}".format(name, acceptable_names)) - - if name == "income": - # Yahoo stores the 'income' table internally under 'financials' key - name = "financials" - - ticker_url = "{}/{}".format(self._scrape_url, self.ticker) - data_stores = self._data.get_json_data_stores(ticker_url + '/' + name, proxy) - _stmt_annual = None - _stmt_qtr = None - try: - # Developers note: TTM and template stuff allows for reproducing the nested structure - # visible on Yahoo website. But more work needed to make it user-friendly! 
Ideally - # return a tree data structure instead of Pandas MultiIndex - # So until this is implemented, just return simple tables - _stmt_annual = self._data.get_financials_time_series("annual", data_stores, proxy) - _stmt_qtr = self._data.get_financials_time_series("quarterly", data_stores, proxy) - - # template_ttm_order, template_annual_order, template_order, level_detail = utils.build_template(data_store["FinancialTemplateStore"]) - # TTM_dicts, Annual_dicts = utils.retreive_financial_details(data_store['QuoteTimeSeriesStore']) - # if name == "balance-sheet": - # # Note: balance sheet is the only financial statement with no ttm detail - # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order) - # else: - # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order, TTM_dicts, template_ttm_order) - - # Data store doesn't contain quarterly data, so retrieve using different url: - # _qtr_data = utils.get_financials_time_series(self.ticker, name, "quarterly", ticker_url, proxy, self.session) - # _stmt_qtr = utils.format_quarterly_financial_statement(_qtr_data, level_detail, template_order) - - except Exception as e: - pass - - return _stmt_annual, _stmt_qtr - def get_recommendations(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._recommendations + self._quote.proxy = proxy + data = self._quote.recommendations if as_dict: return data.to_dict() return data def get_calendar(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._calendar + self._quote.proxy = proxy + data = self._quote.calendar if as_dict: return data.to_dict() return data def get_major_holders(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._major_holders + self._holders.proxy = proxy + data = self._holders.major if as_dict: return data.to_dict() return data def get_institutional_holders(self, proxy=None, as_dict=False): - 
self._get_fundamentals(proxy=proxy) - data = self._institutional_holders + self._holders.proxy = proxy + data = self._holders.institutional if data is not None: if as_dict: return data.to_dict() return data def get_mutualfund_holders(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._mutualfund_holders + self._holders.proxy = proxy + data = self._holders.mutualfund if data is not None: if as_dict: return data.to_dict() return data def get_info(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._info + self._quote.proxy = proxy + data = self._quote.info if as_dict: return data.to_dict() return data def get_sustainability(self, proxy=None, as_dict=False): - self._get_info(proxy) - data = self._sustainability + self._quote.proxy = proxy + data = self._quote.sustainability if as_dict: return data.to_dict() return data - def get_recommendations_summary(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._analyst_trend_details + def get_recommendations_summary(self, proxy=None, as_dict=False): + self._quote.proxy = proxy + data = self._quote.recommendations if as_dict: return data.to_dict() return data - def get_analyst_price_target(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._analyst_price_target + def get_analyst_price_target(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.analyst_price_target if as_dict: return data.to_dict() return data - def get_rev_forecast(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._rev_est + def get_rev_forecast(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.rev_est if as_dict: return data.to_dict() return data def get_earnings_forecast(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._eps_est + 
self._analysis.proxy = proxy + data = self._analysis.eps_est + if as_dict: + return data.to_dict() + return data + + def get_trend_details(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.analyst_trend_details if as_dict: return data.to_dict() return data - def get_earnings_trend(self, proxy=None, as_dict=False, *args, **kwargs): - self._get_fundamentals(proxy=proxy) - data = self._earnings_trend + def get_earnings_trend(self, proxy=None, as_dict=False): + self._analysis.proxy = proxy + data = self._analysis.earnings_trend if as_dict: return data.to_dict() return data def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._earnings[freq] + self._fundamentals.proxy = proxy + data = self._fundamentals.earnings[freq] if as_dict: dict_data = data.to_dict() dict_data['financialCurrency'] = 'USD' if 'financialCurrency' not in self._earnings else self._earnings[ @@ -1121,22 +734,22 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): return data def get_income_stmt(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._financials["income"][freq] + self._fundamentals.proxy = proxy + data = self._fundamentals.financials["income"][freq] if as_dict: return data.to_dict() return data def get_balance_sheet(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._financials["balance-sheet"][freq] + self._fundamentals.proxy = proxy + data = self._fundamentals.financials["balance-sheet"][freq] if as_dict: return data.to_dict() return data def get_cashflow(self, proxy=None, as_dict=False, freq="yearly"): - self._get_fundamentals(proxy=proxy) - data = self._financials["cash-flow"][freq] + self._fundamentals.proxy = proxy + data = self._fundamentals.financials["cash-flow"][freq] if as_dict: return data.to_dict() return data @@ -1166,8 +779,8 @@ def get_actions(self, 
proxy=None): return [] def get_shares(self, proxy=None, as_dict=False): - self._get_fundamentals(proxy=proxy) - data = self._shares + self._fundamentals.proxy = proxy + data = self._fundamentals.shares if as_dict: return data.to_dict() return data @@ -1184,12 +797,13 @@ def get_isin(self, proxy=None): return self._isin q = ticker - self.get_info(proxy=proxy) - if self._info is None: - # Don't print error message cause _get_info() will print one + + self._quote.proxy = proxy + if self._quote.info is None: + # Don't print error message cause self._quote.info will print one return None - if "shortName" in self._info: - q = self._info['shortName'] + if "shortName" in self._quote.info: + q = self._quote.info['shortName'] url = 'https://markets.businessinsider.com/ajax/' \ 'SearchController_Suggest?max_results=25&query=%s' \ @@ -1261,7 +875,7 @@ def get_earnings_dates(self, proxy=None): dates = _pd.concat([dates, data], axis=0) page_offset += page_size - if dates is None or dates.shape[0]==0: + if dates is None or dates.shape[0] == 0: err_msg = "No earnings dates found, symbol may be delisted" print('- %s: %s' % (self.ticker, err_msg)) return None @@ -1290,8 +904,9 @@ def get_earnings_dates(self, proxy=None): dates[cn] = dates[cn] + ' ' + tzinfo["AM/PM"] dates[cn] = _pd.to_datetime(dates[cn], format="%b %d, %Y, %I %p") # - instead of attempting decoding of ambiguous timezone abbreviation, just use 'info': + self._quote.proxy = proxy dates[cn] = dates[cn].dt.tz_localize( - tz=self.get_info()["exchangeTimezoneName"]) + tz=self._quote.info["exchangeTimezoneName"]) dates = dates.set_index("Earnings Date") diff --git a/yfinance/data.py b/yfinance/data.py index acbe02964..97e901fb6 100644 --- a/yfinance/data.py +++ b/yfinance/data.py @@ -43,7 +43,7 @@ class TickerData: 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} def __init__(self, ticker: str, session=None): - self._ticker = ticker + 
self.ticker = ticker self._session = session or requests @lru_cache_freezeargs @@ -90,7 +90,7 @@ def get_financials_time_series(self, timescale, financials_data, proxy=None): acceptable_timestamps = ["annual", "quarterly"] if timescale not in acceptable_timestamps: - raise Exception("timescale '{}' must be one of: {}".format(timescale, acceptable_timestamps)) + raise ValueError("timescale '{}' must be one of: {}".format(timescale, acceptable_timestamps)) # Step 1: get the keys: def _finditem1(key, obj): @@ -109,7 +109,7 @@ def _finditem1(key, obj): # Step 2: construct url: ts_url_base = "https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{0}?symbol={0}".format( - self._ticker) + self.ticker) if len(keys) == 0: raise Exception("Fetching keys failed") url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys]) diff --git a/yfinance/scrapers/__init__.py b/yfinance/scrapers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/yfinance/scrapers/analysis.py b/yfinance/scrapers/analysis.py new file mode 100644 index 000000000..4c5747b1f --- /dev/null +++ b/yfinance/scrapers/analysis.py @@ -0,0 +1,121 @@ +import pandas as pd + +from yfinance import utils +from yfinance.data import TickerData + + +class Analysis: + _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self._earnings_trend = None + self._analyst_trend_details = None + self._analyst_price_target = None + self._rev_est = None + self._eps_est = None + self._already_scraped = False + + @property + def earnings_trend(self): + if self._earnings_trend is None: + self._scrape(self.proxy) + return self._earnings_trend + + @property + def analyst_trend_details(self): + if self._analyst_trend_details is None: + self._scrape(self.proxy) + return self._analyst_trend_details + + @property + def analyst_price_target(self): + if self._analyst_price_target is None: + 
self._scrape(self.proxy) + return self._analyst_price_target + + @property + def rev_est(self): + if self._rev_est is None: + self._scrape(self.proxy) + return self._rev_est + + @property + def eps_est(self): + if self._eps_est is None: + self._scrape(self.proxy) + return self._eps_est + + def _scrape(self, proxy): + if self._already_scraped: + return + self._already_scraped = True + + ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) + + # Analysis Data/Analyst Forecasts + analysis_data = self._data.get_json_data_stores(ticker_url + '/analysis', proxy) + try: + analysis_data = analysis_data['QuoteSummaryStore'] + except KeyError as e: + err_msg = "No analysis data found, symbol may be delisted" + print('- %s: %s' % (self._data.ticker, err_msg)) + return + + if isinstance(analysis_data.get('earningsTrend'), dict): + try: + analysis = pd.DataFrame(analysis_data['earningsTrend']['trend']) + analysis['endDate'] = pd.to_datetime(analysis['endDate']) + analysis.set_index('period', inplace=True) + analysis.index = analysis.index.str.upper() + analysis.index.name = 'Period' + analysis.columns = utils.camel2title(analysis.columns) + + dict_cols = [] + + for idx, row in analysis.iterrows(): + for colname, colval in row.items(): + if isinstance(colval, dict): + dict_cols.append(colname) + for k, v in colval.items(): + new_colname = colname + ' ' + \ + utils.camel2title([k])[0] + analysis.loc[idx, new_colname] = v + + self._earnings_trend = analysis[[ + c for c in analysis.columns if c not in dict_cols]] + except Exception: + pass + + try: + self._analyst_trend_details = pd.DataFrame(analysis_data['recommendationTrend']['trend']) + except Exception as e: + self._analyst_trend_details = None + try: + self._analyst_price_target = pd.DataFrame(analysis_data['financialData'], index=[0])[ + ['targetLowPrice', 'currentPrice', 'targetMeanPrice', 'targetHighPrice', 'numberOfAnalystOpinions']].T + except Exception as e: + self._analyst_price_target = None + 
earnings_estimate = [] + revenue_estimate = [] + if len(self._analyst_trend_details) != 0: + for key in analysis_data['earningsTrend']['trend']: + try: + earnings_dict = key['earningsEstimate'] + earnings_dict['period'] = key['period'] + earnings_dict['endDate'] = key['endDate'] + earnings_estimate.append(earnings_dict) + + revenue_dict = key['revenueEstimate'] + revenue_dict['period'] = key['period'] + revenue_dict['endDate'] = key['endDate'] + revenue_estimate.append(revenue_dict) + except Exception as e: + pass + self._rev_est = pd.DataFrame(revenue_estimate) + self._eps_est = pd.DataFrame(earnings_estimate) + else: + self._rev_est = pd.DataFrame() + self._eps_est = pd.DataFrame() diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py new file mode 100644 index 000000000..8013098cd --- /dev/null +++ b/yfinance/scrapers/fundamentals.py @@ -0,0 +1,150 @@ +import pandas as pd + +from yfinance import utils +from yfinance.data import TickerData + + +class Fundamentals: + _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self.ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) + + self._earnings = None + self._financials = None + self._shares = None + + self._financials_data = None + self._fin_data_quote = None + self._basics_already_scraped = False + self._already_scraped_financials = False + + @property + def earnings(self): + if self._earnings is None: + self._scrape_earnings(self.proxy) + return self._earnings + + @property + def financials(self): + if self._financials is None: + self._scrape_financials(self.proxy) + return self._financials + + @property + def shares(self): + if self._shares is None: + self._scrape_shares(self.proxy) + return self._shares + + def _scrape_basics(self, proxy): + if self._basics_already_scraped: + return + self._basics_already_scraped = True + + self._financials_data = 
self._data.get_json_data_stores(self.ticker_url + '/financials', proxy) + try: + self._fin_data_quote = self._financials_data['QuoteSummaryStore'] + except KeyError: + err_msg = "No financials data found, symbol may be delisted" + print('- %s: %s' % (self._data.ticker, err_msg)) + return None + + def _scrape_earnings(self, proxy): + self._scrape_basics(proxy) + # earnings + self._earnings = {"yearly": pd.DataFrame(), "quarterly": pd.DataFrame()} + if self._fin_data_quote is None: + return + if isinstance(self._fin_data_quote.get('earnings'), dict): + try: + earnings = self._fin_data_quote['earnings']['financialsChart'] + earnings['financialCurrency'] = self._fin_data_quote['earnings'].get('financialCurrency', 'USD') + self._earnings['financialCurrency'] = earnings['financialCurrency'] + df = pd.DataFrame(earnings['yearly']).set_index('date') + df.columns = utils.camel2title(df.columns) + df.index.name = 'Year' + self._earnings['yearly'] = df + + df = pd.DataFrame(earnings['quarterly']).set_index('date') + df.columns = utils.camel2title(df.columns) + df.index.name = 'Quarter' + self._earnings['quarterly'] = df + except Exception: + pass + + def _scrape_shares(self, proxy): + self._scrape_basics(proxy) + # shares outstanding + try: + # keep only years with non None data + available_shares = [shares_data for shares_data in + self._financials_data['QuoteTimeSeriesStore']['timeSeries']['annualBasicAverageShares'] + if + shares_data] + shares = pd.DataFrame(available_shares) + shares['Year'] = shares['asOfDate'].agg(lambda x: int(x[:4])) + shares.set_index('Year', inplace=True) + shares.drop(columns=['dataId', 'asOfDate', + 'periodType', 'currencyCode'], inplace=True) + shares.rename( + columns={'reportedValue': "BasicShares"}, inplace=True) + self._shares = shares + except Exception: + pass + + def _scrape_financials(self, proxy): + self._scrape_basics(proxy) + if self._already_scraped_financials: + return + self._already_scraped_financials = True + + # get fundamentals 
+ self._financials = {} + for name in ["income", "balance-sheet", "cash-flow"]: + self._financials[name] = {"yearly": pd.DataFrame(), "quarterly": pd.DataFrame()} + annual, qtr = self._create_financials_table(name, proxy) + if annual is not None: + self._financials[name]["yearly"] = annual + if qtr is not None: + self._financials[name]["quarterly"] = qtr + + def _create_financials_table(self, name, proxy): + acceptable_names = ["income", "balance-sheet", "cash-flow"] + if name not in acceptable_names: + raise ValueError("name '{}' must be one of: {}".format(name, acceptable_names)) + + if name == "income": + # Yahoo stores the 'income' table internally under 'financials' key + name = "financials" + + data_stores = self._data.get_json_data_stores(self.ticker_url + '/' + name, proxy) + _stmt_annual = None + _stmt_qtr = None + try: + # Developers note: TTM and template stuff allows for reproducing the nested structure + # visible on Yahoo website. But more work needed to make it user-friendly! 
Ideally + # return a tree data structure instead of Pandas MultiIndex + # So until this is implemented, just return simple tables + _stmt_annual = self._data.get_financials_time_series("annual", data_stores, proxy) + _stmt_qtr = self._data.get_financials_time_series("quarterly", data_stores, proxy) + + # template_ttm_order, template_annual_order, template_order, level_detail = utils.build_template(data_store["FinancialTemplateStore"]) + # TTM_dicts, Annual_dicts = utils.retreive_financial_details(data_store['QuoteTimeSeriesStore']) + # if name == "balance-sheet": + # # Note: balance sheet is the only financial statement with no ttm detail + # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order) + # else: + # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order, TTM_dicts, template_ttm_order) + + # Data store doesn't contain quarterly data, so retrieve using different url: + # _qtr_data = utils.get_financials_time_series(self._ticker.ticker, name, "quarterly", ticker_url, proxy, self.session) + # _stmt_qtr = utils.format_quarterly_financial_statement(_qtr_data, level_detail, template_order) + + except Exception as e: + pass + + return _stmt_annual, _stmt_qtr diff --git a/yfinance/scrapers/holders.py b/yfinance/scrapers/holders.py new file mode 100644 index 000000000..96eeb521b --- /dev/null +++ b/yfinance/scrapers/holders.py @@ -0,0 +1,66 @@ +import pandas as pd + +from yfinance.data import TickerData + +class Holders: + _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self._major = None + self._institutional = None + self._mutualfund = None + + @property + def major(self): + if self._major is None: + self._scrape(self.proxy) + return self._major + + @property + def institutional(self): + if self._institutional is None: + self._scrape(self.proxy) + return 
self._institutional + + @property + def mutualfund(self): + if self._mutualfund is None: + self._scrape(self.proxy) + return self._mutualfund + + def _scrape(self, proxy): + ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) + try: + resp = self._data.get(ticker_url + '/holders', proxy) + holders = pd.read_html(resp.text) + except Exception: + holders = [] + + if len(holders) >= 3: + self._major = holders[0] + self._institutional = holders[1] + self._mutualfund = holders[2] + elif len(holders) >= 2: + self._major = holders[0] + self._institutional = holders[1] + elif len(holders) >= 1: + self._major = holders[0] + + if self._institutional is not None: + if 'Date Reported' in self._institutional: + self._institutional['Date Reported'] = pd.to_datetime( + self._institutional['Date Reported']) + if '% Out' in self._institutional: + self._institutional['% Out'] = self._institutional[ + '% Out'].str.replace('%', '').astype(float) / 100 + + if self._mutualfund is not None: + if 'Date Reported' in self._mutualfund: + self._mutualfund['Date Reported'] = pd.to_datetime( + self._mutualfund['Date Reported']) + if '% Out' in self._mutualfund: + self._mutualfund['% Out'] = self._mutualfund[ + '% Out'].str.replace('%', '').astype(float) / 100 diff --git a/yfinance/scrapers/quote.py b/yfinance/scrapers/quote.py new file mode 100644 index 000000000..73740400f --- /dev/null +++ b/yfinance/scrapers/quote.py @@ -0,0 +1,214 @@ +import datetime +import json + +import pandas as pd + +from yfinance import utils +from yfinance.data import TickerData + + +class Quote: + _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' + + def __init__(self, data: TickerData, proxy=None): + self._data = data + self.proxy = proxy + + self._info = None + self._sustainability = None + self._recommendations = None + self._calendar = None + + self._already_scraped = False + self._already_scraped_complementary = False + + @property + def info(self): + if self._info is None: + 
self._scrape(self.proxy) + self._scrape_complementary(self.proxy) + + return self._info + + @property + def sustainability(self): + if self._sustainability is None: + self._scrape(self.proxy) + return self._sustainability + + @property + def recommendations(self): + if self._recommendations is None: + self._scrape(self.proxy) + return self._recommendations + + @property + def calendar(self): + if self._calendar is None: + self._scrape(self.proxy) + return self._calendar + + def _scrape(self, proxy): + if self._already_scraped: + return + self._already_scraped = True + + ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) + + # get info and sustainability + json_data = self._data.get_json_data_stores(ticker_url, proxy) + try: + data = json_data['QuoteSummaryStore'] + except KeyError: + err_msg = "No summary info found, symbol may be delisted" + print('- %s: %s' % (self._data.ticker, err_msg)) + return None + + # sustainability + d = {} + try: + if isinstance(data.get('esgScores'), dict): + for item in data['esgScores']: + if not isinstance(data['esgScores'][item], (dict, list)): + d[item] = data['esgScores'][item] + + s = pd.DataFrame(index=[0], data=d)[-1:].T + s.columns = ['Value'] + s.index.name = '%.f-%.f' % ( + s[s.index == 'ratingYear']['Value'].values[0], + s[s.index == 'ratingMonth']['Value'].values[0]) + + self._sustainability = s[~s.index.isin( + ['maxAge', 'ratingYear', 'ratingMonth'])] + except Exception: + pass + + # info (be nice to python 2) + self._info = {} + try: + items = ['summaryProfile', 'financialData', 'quoteType', + 'defaultKeyStatistics', 'assetProfile', 'summaryDetail'] + for item in items: + if isinstance(data.get(item), dict): + self._info.update(data[item]) + except Exception: + pass + + # For ETFs, provide this valuable data: the top holdings of the ETF + try: + if 'topHoldings' in data: + self._info.update(data['topHoldings']) + except Exception: + pass + + try: + if not isinstance(data.get('summaryDetail'), dict): + # 
For some reason summaryDetail did not give any results. The price dict + # usually has most of the same info + self._info.update(data.get('price', {})) + except Exception: + pass + + try: + # self._info['regularMarketPrice'] = self._info['regularMarketOpen'] + self._info['regularMarketPrice'] = data.get('price', {}).get( + 'regularMarketPrice', self._info.get('regularMarketOpen', None)) + except Exception: + pass + + try: + self._info['preMarketPrice'] = data.get('price', {}).get( + 'preMarketPrice', self._info.get('preMarketPrice', None)) + except Exception: + pass + + self._info['logo_url'] = "" + try: + if not 'website' in self._info: + self._info['logo_url'] = 'https://logo.clearbit.com/%s.com' % \ + self._info['shortName'].split(' ')[0].split(',')[0] + else: + domain = self._info['website'].split( + '://')[1].split('/')[0].replace('www.', '') + self._info['logo_url'] = 'https://logo.clearbit.com/%s' % domain + except Exception: + pass + + # events + try: + cal = pd.DataFrame(data['calendarEvents']['earnings']) + cal['earningsDate'] = pd.to_datetime( + cal['earningsDate'], unit='s') + self._calendar = cal.T + self._calendar.index = utils.camel2title(self._calendar.index) + self._calendar.columns = ['Value'] + except Exception as e: + pass + + # analyst recommendations + try: + rec = pd.DataFrame( + data['upgradeDowngradeHistory']['history']) + rec['earningsDate'] = pd.to_datetime( + rec['epochGradeDate'], unit='s') + rec.set_index('earningsDate', inplace=True) + rec.index.name = 'Date' + rec.columns = utils.camel2title(rec.columns) + self._recommendations = rec[[ + 'Firm', 'To Grade', 'From Grade', 'Action']].sort_index() + except Exception: + pass + + def _scrape_complementary(self, proxy): + if self._already_scraped_complementary: + return + self._already_scraped_complementary = True + + self._scrape(proxy) + if self._info is None: + return + + # Complementary key-statistics. 
For now just want 'trailing PEG ratio' + keys = {"trailingPegRatio"} + if keys: + # Simplified the original scrape code for key-statistics. Very expensive for fetching + # just one value, best if scraping most/all: + # + # p = _re.compile(r'root\.App\.main = (.*);') + # url = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'.format(self._ticker.ticker, self._ticker.ticker) + # try: + # r = session.get(url, headers=utils.user_agent_headers) + # data = _json.loads(p.findall(r.text)[0]) + # key_stats = data['context']['dispatcher']['stores']['QuoteTimeSeriesStore']["timeSeries"] + # for k in keys: + # if k not in key_stats or len(key_stats[k])==0: + # # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate + # v = None + # else: + # # Select most recent (last) raw value in list: + # v = key_stats[k][-1]["reportedValue"]["raw"] + # self._info[k] = v + # except Exception: + # raise + # pass + # + # For just one/few variable is faster to query directly: + url = "https://query1.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{}?symbol={}".format( + self._data.ticker, self._data.ticker) + for k in keys: + url += "&type=" + k + # Request 6 months of data + url += "&period1={}".format( + int((datetime.datetime.now() - datetime.timedelta(days=365 // 2)).timestamp())) + url += "&period2={}".format(int((datetime.datetime.now() + datetime.timedelta(days=1)).timestamp())) + + json_str = self._data.get(url=url, proxy=proxy).text + json_data = json.loads(json_str) + key_stats = json_data["timeseries"]["result"][0] + if k not in key_stats: + # Yahoo website prints N/A, indicates Yahoo lacks necessary data to calculate + v = None + else: + # Select most recent (last) raw value in list: + v = key_stats[k][-1]["reportedValue"]["raw"] + self._info[k] = v diff --git a/yfinance/ticker.py b/yfinance/ticker.py index 0c821ebf8..b17a179eb 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -30,6 +30,9 @@ class Ticker(TickerBase): + def 
__init__(self, ticker, session=None): + super(Ticker, self).__init__(ticker, session=session) + self._expirations = {} def __repr__(self): return 'yfinance.Ticker object <%s>' % self.ticker From f4b3348c8e8960ab7edc5a6a96e1ce7a6a86e639 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Fri, 11 Nov 2022 00:02:30 +0100 Subject: [PATCH 04/41] Continued refactoring Fix for #1171 Change default start to 1900-01-01 Refactored financials to remove unnecessary requests Dividends not working on windows (DEV) Add typehints to API --- tests/prices.py | 2 - tests/ticker.py | 99 ++++++++++++++++ yfinance/base.py | 90 ++++++++------- yfinance/data.py | 88 +++------------ yfinance/exceptions.py | 6 + yfinance/scrapers/analysis.py | 17 ++- yfinance/scrapers/fundamentals.py | 182 +++++++++++++++++++++--------- yfinance/scrapers/holders.py | 6 +- yfinance/scrapers/quote.py | 44 ++++---- yfinance/ticker.py | 54 ++++----- 10 files changed, 356 insertions(+), 232 deletions(-) create mode 100644 yfinance/exceptions.py diff --git a/tests/prices.py b/tests/prices.py index d5ef0bcbc..92718b057 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -10,7 +10,6 @@ import requests_cache - class TestPriceHistory(unittest.TestCase): session = None @@ -443,4 +442,3 @@ def test_repair_daily_zeroes(self): # test_src.index(f"def {x}") - test_src.index(f"def {y}") # ) # unittest.main(verbosity=2) - diff --git a/tests/ticker.py b/tests/ticker.py index 5a67a2a39..dd2e990fe 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -86,6 +86,70 @@ def test_badTicker(self): dat.earnings_dates dat.earnings_forecasts + def test_goodTicker(self): + # that yfinance works when full api is called on same instance of ticker + + tkr = "IBM" + dat = yf.Ticker(tkr, session=self.session) + + dat.isin + dat.major_holders + dat.institutional_holders + dat.mutualfund_holders + dat.dividends + dat.splits + dat.actions + dat.shares + dat.info + dat.calendar + dat.recommendations + dat.earnings + dat.quarterly_earnings 
+ dat.income_stmt + dat.quarterly_income_stmt + dat.balance_sheet + dat.quarterly_balance_sheet + dat.cashflow + dat.quarterly_cashflow + dat.recommendations_summary + dat.analyst_price_target + dat.revenue_forecasts + dat.sustainability + dat.options + dat.news + dat.earnings_trend + dat.earnings_dates + dat.earnings_forecasts + + dat.history(period="1wk") + dat.history(start="2022-01-01") + dat.history(start="2022-01-01", end="2022-03-01") + yf.download([tkr], period="1wk") + + +class TestTickerHistory(unittest.TestCase): + def setUp(self): + # use a ticker that has dividends + self.ticker = yf.Ticker("IBM") + + def tearDown(self): + self.ticker = None + + def test_dividends(self): + data = self.ticker.dividends + self.assertIsInstance(data, pd.Series, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + + def test_splits(self): + data = self.ticker.splits + self.assertIsInstance(data, pd.Series, "data has wrong type") + # self.assertFalse(data.empty, "data is empty") + + def test_actions(self): + data = self.ticker.actions + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + class TestTickerEarnings(unittest.TestCase): @@ -185,6 +249,26 @@ def setUp(self): def tearDown(self): self.ticker = None + def test_income_statement(self): + expected_row = "TotalRevenue" + data = self.ticker.income_stmt + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertIn(expected_row, data.index, "Did not find expected row in index") + + data_cached = self.ticker.income_stmt + self.assertIs(data, data_cached, "data not cached") + + def test_quarterly_income_statement(self): + expected_row = "TotalRevenue" + data = self.ticker.quarterly_income_stmt + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertIn(expected_row, data.index, "Did not find expected row 
in index") + + data_cached = self.ticker.quarterly_income_stmt + self.assertIs(data, data_cached, "data not cached") + def test_balance_sheet(self): expected_row = "TotalAssets" data = self.ticker.balance_sheet @@ -286,12 +370,27 @@ def test_options(self): self.assertIsInstance(data, tuple, "data has wrong type") self.assertTrue(len(data) > 1, "data is empty") + def test_shares(self): + data = self.ticker.shares + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + + def test_info(self): + data = self.ticker.info + self.assertIsInstance(data, dict, "data has wrong type") + self.assertIn("symbol", data.keys(), "Did not find expected key in info dict") + self.assertEqual("GOOGL", data["symbol"], "Wrong symbol value in info dict") + + def test_bad_freq_value_raises_exception(self): + self.assertRaises(ValueError, lambda : self.ticker.get_cashflow(freq="badarg")) + def suite(): suite = unittest.TestSuite() suite.addTest(TestTicker('Test ticker')) suite.addTest(TestTickerEarnings('Test earnings')) suite.addTest(TestTickerHolders('Test holders')) + suite.addTest(TestTickerHistory('Test Ticker history')) suite.addTest(TestTickerMiscFinancials('Test misc financials')) return suite diff --git a/yfinance/base.py b/yfinance/base.py index 7a4e1ced5..cb6f9c293 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -23,8 +23,11 @@ import time as _time import datetime as _datetime +from typing import Optional + import pandas as _pd import numpy as _np +import pandas as pd from .data import TickerData @@ -77,14 +80,14 @@ def stats(self, proxy=None): ticker_url = "{}/{}".format(self._scrape_url, self.ticker) # get info and sustainability - data = self._data.get_json_data_stores(ticker_url, proxy)["QuoteSummaryStore"] + data = self._data.get_json_data_stores(proxy=proxy)["QuoteSummaryStore"] return data def history(self, period="1mo", interval="1d", start=None, end=None, prepost=False, actions=True, auto_adjust=True, 
back_adjust=False, repair=False, keepna=False, proxy=None, rounding=False, timeout=10, - debug=True, raise_errors=False): + debug=True, raise_errors=False) -> pd.DataFrame: """ :Parameters: period : str @@ -152,8 +155,8 @@ def history(self, period="1mo", interval="1d", if interval == "1m": start = end - 604800 # Subtract 7 days else: - #time stamp of 01/01/1900 - start = -2208994789 + _UNIX_TIMESTAMP_1900 = -2208994789 + start = _UNIX_TIMESTAMP_1900 else: start = utils._parse_user_dt(start, tz) params = {"period1": start, "period2": end} @@ -299,13 +302,15 @@ def history(self, period="1mo", interval="1d", # actions dividends, splits = utils.parse_actions(data["chart"]["result"][0]) if start is not None: - startDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(start)) + # Note: use pandas Timestamp as datetime.utcfromtimestamp has bugs on windows + # https://github.com/python/cpython/issues/81708 + startDt = _pd.Timestamp(start, unit='s') if dividends is not None: dividends = dividends[dividends.index >= startDt] if splits is not None: splits = splits[splits.index >= startDt] if end is not None: - endDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(end)) + endDt = _pd.Timestamp(end, unit='s') if dividends is not None: dividends = dividends[dividends.index < endDt] if splits is not None: @@ -362,10 +367,10 @@ def history(self, period="1mo", interval="1d", def _reconstruct_interval(self, df_row, interval, bad_fields): if isinstance(df_row, _pd.DataFrame) or not isinstance(df_row, _pd.Series): raise Exception("'df_row' must be a Pandas Series not", type(df_row)) - if not isinstance(bad_fields, (list,set,_np.ndarray)): + if not isinstance(bad_fields, (list, set, _np.ndarray)): raise Exception("'bad_fields' must be a list/set not", type(bad_fields)) - data_cols = [c for c in ["Open","High","Low","Close","Adj Close"] if c in df_row.index] + data_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df_row.index] # If interval is weekly 
then can construct with daily. But if smaller intervals then # restricted to recent times: @@ -386,56 +391,59 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): idx = df_row.name start = idx.date() - if sub_interval=="1h" and (_datetime.date.today()-start) > _datetime.timedelta(days=729): + if sub_interval == "1h" and (_datetime.date.today() - start) > _datetime.timedelta(days=729): # Don't bother requesting more price data, Yahoo will reject return None else: new_vals = {} - if sub_interval=="1h": - df_fine = self.history(start=start, end=start+td_range, interval=sub_interval, auto_adjust=False) + if sub_interval == "1h": + df_fine = self.history(start=start, end=start + td_range, interval=sub_interval, auto_adjust=False) else: - df_fine = self.history(start=start-td_range, end=start+td_range, interval=sub_interval, auto_adjust=False) + df_fine = self.history(start=start - td_range, end=start + td_range, interval=sub_interval, + auto_adjust=False) # First, check whether df_fine has different split-adjustment than df_row. # If it is different, then adjust df_fine to match df_row - good_fields = list(set(data_cols)-set(bad_fields)-set("Adj Close")) - if len(good_fields)==0: - raise Exception("No good fields, so cannot determine whether different split-adjustment. Contact developers") + good_fields = list(set(data_cols) - set(bad_fields) - set("Adj Close")) + if len(good_fields) == 0: + raise Exception( + "No good fields, so cannot determine whether different split-adjustment. 
Contact developers") # median = df_row.loc[good_fields].median() # median_fine = _np.median(df_fine[good_fields].values) # ratio = median/median_fine # Better method to calculate split-adjustment: - df_fine_from_idx = df_fine[df_fine.index>=idx] + df_fine_from_idx = df_fine[df_fine.index >= idx] ratios = [] for f in good_fields: - if f=="Low": + if f == "Low": ratios.append(df_row[f] / df_fine_from_idx[f].min()) - elif f=="High": + elif f == "High": ratios.append(df_row[f] / df_fine_from_idx[f].max()) - elif f=="Open": + elif f == "Open": ratios.append(df_row[f] / df_fine_from_idx[f].iloc[0]) - elif f=="Close": + elif f == "Close": ratios.append(df_row[f] / df_fine_from_idx[f].iloc[-1]) ratio = _np.mean(ratios) # - ratio_rcp = round(1.0/ratio, 1) ; ratio = round(ratio, 1) - if ratio==1 and ratio_rcp==1: + ratio_rcp = round(1.0 / ratio, 1) + ratio = round(ratio, 1) + if ratio == 1 and ratio_rcp == 1: # Good! pass else: - if ratio>1: + if ratio > 1: # data has different split-adjustment than fine-grained data # Adjust fine-grained to match df_fine[data_cols] *= ratio - elif ratio_rcp>1: + elif ratio_rcp > 1: # data has different split-adjustment than fine-grained data # Adjust fine-grained to match - df_fine[data_cols] *= 1.0/ratio_rcp + df_fine[data_cols] *= 1.0 / ratio_rcp if sub_interval != "1h": - df_last_week = df_fine[df_fine.index=idx] + df_last_week = df_fine[df_fine.index < idx] + df_fine = df_fine[df_fine.index >= idx] if "High" in bad_fields: new_vals["High"] = df_fine["High"].max() @@ -487,7 +495,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): if (median == 0).any(): raise Exception("median contains zeroes, why?") ratio = df2[data_cols].values / median - ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20 + ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20 f = ratio_rounded == 100 # Store each mixup: @@ -499,7 +507,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): for i in _np.where(fj)[0]: idx = 
df2.index[i] if idx not in mixups: - mixups[idx] = {"data": df2.loc[idx, data_cols], "fields":{dc}} + mixups[idx] = {"data": df2.loc[idx, data_cols], "fields": {dc}} else: mixups[idx]["fields"].add(dc) n_mixups = len(mixups) @@ -555,22 +563,22 @@ def _fix_zero_prices(self, df, interval, tz_exchange): else: df2.index = df2.index.tz_convert(tz_exchange) - data_cols = ["Open","High","Low","Close"] + data_cols = ["Open", "High", "Low", "Close"] data_cols = [c for c in data_cols if c in df2.columns] - f_zeroes = (df2[data_cols]==0.0).values.any(axis=1) + f_zeroes = (df2[data_cols] == 0.0).values.any(axis=1) n_fixed = 0 for i in _np.where(f_zeroes)[0]: idx = df2.index[i] df_row = df2.loc[idx] - bad_fields = df2.columns[df_row.values==0.0].values + bad_fields = df2.columns[df_row.values == 0.0].values new_values = self._reconstruct_interval(df2.loc[idx], interval, bad_fields) if not new_values is None: for k in new_values: df2.loc[idx, k] = new_values[k] n_fixed += 1 - if n_fixed>0: + if n_fixed > 0: print("{}: fixed {} price=0.0 errors in {} price data".format(self.ticker, n_fixed, interval)) return df2 @@ -667,11 +675,9 @@ def get_mutualfund_holders(self, proxy=None, as_dict=False): return data.to_dict() return data - def get_info(self, proxy=None, as_dict=False): + def get_info(self, proxy=None) -> dict: self._quote.proxy = proxy data = self._quote.info - if as_dict: - return data.to_dict() return data def get_sustainability(self, proxy=None, as_dict=False): @@ -735,21 +741,21 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): def get_income_stmt(self, proxy=None, as_dict=False, freq="yearly"): self._fundamentals.proxy = proxy - data = self._fundamentals.financials["income"][freq] + data = self._fundamentals.financials.get_income(freq=freq, proxy=proxy) if as_dict: return data.to_dict() return data def get_balance_sheet(self, proxy=None, as_dict=False, freq="yearly"): self._fundamentals.proxy = proxy - data = 
self._fundamentals.financials["balance-sheet"][freq] + data = self._fundamentals.financials.get_balance_sheet(freq=freq, proxy=proxy) if as_dict: return data.to_dict() return data def get_cashflow(self, proxy=None, as_dict=False, freq="yearly"): self._fundamentals.proxy = proxy - data = self._fundamentals.financials["cash-flow"][freq] + data = self._fundamentals.financials.get_cash_flow(freq=freq, proxy=proxy) if as_dict: return data.to_dict() return data @@ -785,7 +791,7 @@ def get_shares(self, proxy=None, as_dict=False): return data.to_dict() return data - def get_isin(self, proxy=None): + def get_isin(self, proxy=None) -> Optional[str]: # *** experimental *** if self._isin is not None: return self._isin @@ -841,7 +847,7 @@ def get_news(self, proxy=None): self._news = data.get("news", []) return self._news - def get_earnings_dates(self, proxy=None): + def get_earnings_dates(self, proxy=None) -> Optional[pd.DataFrame]: if self._earnings_dates is not None: return self._earnings_dates @@ -914,7 +920,7 @@ def get_earnings_dates(self, proxy=None): return dates - def get_earnings_history(self, proxy=None): + def get_earnings_history(self, proxy=None) -> Optional[pd.DataFrame]: if self._earnings_history is not None: return self._earnings_history diff --git a/yfinance/data.py b/yfinance/data.py index 97e901fb6..ef3f6cde1 100644 --- a/yfinance/data.py +++ b/yfinance/data.py @@ -1,8 +1,6 @@ -import datetime import functools from functools import lru_cache -import pandas as pd import requests as requests import re @@ -18,14 +16,16 @@ def lru_cache_freezeargs(func): """ - Decorator transforms mutable dictionary arguments into immutable - Needed so lru_cache can cache method calls what has dict arguments. + Decorator transforms mutable dictionary and list arguments into immutable types + Needed so lru_cache can cache method calls what has dict or list arguments. 
""" @functools.wraps(func) def wrapped(*args, **kwargs): args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args]) kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()} + args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args]) + kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in kwargs.items()} return func(*args, **kwargs) # copy over the lru_cache extra methods to this wrapper to be able to access them @@ -35,6 +35,9 @@ def wrapped(*args, **kwargs): return wrapped +_SCRAPE_URL_ = 'https://finance.yahoo.com/quote' + + class TickerData: """ Have one place to retrieve data from Yahoo API in order to ease caching and speed up operations @@ -68,12 +71,18 @@ def _get_proxy(self, proxy): @lru_cache_freezeargs @lru_cache(maxsize=cache_maxsize) - def get_json_data_stores(self, url, proxy=None): + def get_json_data_stores(self, sub_page: str = None, proxy=None) -> dict: ''' get_json_data_stores returns a python dictionary of the data stores in yahoo finance web page. 
''' - html = self.get(url=url, proxy=proxy).text + if sub_page: + ticker_url = "{}/{}/{}".format(_SCRAPE_URL_, self.ticker, sub_page) + else: + ticker_url = "{}/{}".format(_SCRAPE_URL_, self.ticker) + html = self.get(url=ticker_url, proxy=proxy).text + + # The actual json-data for stores is in a javascript assignment in the webpage json_str = html.split('root.App.main =')[1].split( '(this)')[0].split(';\n}')[0].strip() data = json.loads(json_str)['context']['dispatcher']['stores'] @@ -84,70 +93,3 @@ def get_json_data_stores(self, url, proxy=None): r'{[\'|\"]raw[\'|\"]:(.*?),(.*?)}', r'\1', new_data) return json.loads(new_data) - - # Note cant use lru_cache as financials_data is a nested dict (freezeargs only handle flat dicts) - def get_financials_time_series(self, timescale, financials_data, proxy=None): - - acceptable_timestamps = ["annual", "quarterly"] - if timescale not in acceptable_timestamps: - raise ValueError("timescale '{}' must be one of: {}".format(timescale, acceptable_timestamps)) - - # Step 1: get the keys: - def _finditem1(key, obj): - values = [] - if isinstance(obj, dict): - if key in obj.keys(): - values.append(obj[key]) - for k, v in obj.items(): - values += _finditem1(key, v) - elif isinstance(obj, list): - for v in obj: - values += _finditem1(key, v) - return values - - keys = _finditem1("key", financials_data['FinancialTemplateStore']) - - # Step 2: construct url: - ts_url_base = "https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{0}?symbol={0}".format( - self.ticker) - if len(keys) == 0: - raise Exception("Fetching keys failed") - url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys]) - # Yahoo returns maximum 4 years or 5 quarters, regardless of start_dt: - start_dt = datetime.datetime(2016, 12, 31) - end = (datetime.datetime.now() + datetime.timedelta(days=366)) - url += "&period1={}&period2={}".format(int(start_dt.timestamp()), int(end.timestamp())) - - # Step 3: fetch and reshape data - 
json_str = self.get(url=url, proxy=proxy).text - json_data = json.loads(json_str) - data_raw = json_data["timeseries"]["result"] - # data_raw = [v for v in data_raw if len(v) > 1] # Discard keys with no data - for d in data_raw: - del d["meta"] - - # Now reshape data into a table: - # Step 1: get columns and index: - timestamps = set() - data_unpacked = {} - for x in data_raw: - for k in x.keys(): - if k == "timestamp": - timestamps.update(x[k]) - else: - data_unpacked[k] = x[k] - timestamps = sorted(list(timestamps)) - dates = pd.to_datetime(timestamps, unit="s") - df = pd.DataFrame(columns=dates, index=list(data_unpacked.keys())) - for k, v in data_unpacked.items(): - if df is None: - df = pd.DataFrame(columns=dates, index=[k]) - df.loc[k] = {pd.Timestamp(x["asOfDate"]): x["reportedValue"]["raw"] for x in v} - - df.index = df.index.str.replace("^" + timescale, "", regex=True) - - # Reorder table to match order on Yahoo website - df = df.reindex([k for k in keys if k in df.index]) - df = df[sorted(df.columns, reverse=True)] - - return df diff --git a/yfinance/exceptions.py b/yfinance/exceptions.py new file mode 100644 index 000000000..866a3c620 --- /dev/null +++ b/yfinance/exceptions.py @@ -0,0 +1,6 @@ +class YFianceException(Exception): + pass + + +class YFianceDataException(YFianceException): + pass diff --git a/yfinance/scrapers/analysis.py b/yfinance/scrapers/analysis.py index 4c5747b1f..e381b01f1 100644 --- a/yfinance/scrapers/analysis.py +++ b/yfinance/scrapers/analysis.py @@ -5,7 +5,6 @@ class Analysis: - _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' def __init__(self, data: TickerData, proxy=None): self._data = data @@ -19,31 +18,31 @@ def __init__(self, data: TickerData, proxy=None): self._already_scraped = False @property - def earnings_trend(self): + def earnings_trend(self) -> pd.DataFrame: if self._earnings_trend is None: self._scrape(self.proxy) return self._earnings_trend @property - def analyst_trend_details(self): + def 
analyst_trend_details(self) -> pd.DataFrame: if self._analyst_trend_details is None: self._scrape(self.proxy) return self._analyst_trend_details @property - def analyst_price_target(self): + def analyst_price_target(self) -> pd.DataFrame: if self._analyst_price_target is None: self._scrape(self.proxy) return self._analyst_price_target @property - def rev_est(self): + def rev_est(self) -> pd.DataFrame: if self._rev_est is None: self._scrape(self.proxy) return self._rev_est @property - def eps_est(self): + def eps_est(self) -> pd.DataFrame: if self._eps_est is None: self._scrape(self.proxy) return self._eps_est @@ -53,10 +52,8 @@ def _scrape(self, proxy): return self._already_scraped = True - ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) - # Analysis Data/Analyst Forecasts - analysis_data = self._data.get_json_data_stores(ticker_url + '/analysis', proxy) + analysis_data = self._data.get_json_data_stores("analysis", proxy=proxy) try: analysis_data = analysis_data['QuoteSummaryStore'] except KeyError as e: @@ -100,7 +97,7 @@ def _scrape(self, proxy): self._analyst_price_target = None earnings_estimate = [] revenue_estimate = [] - if len(self._analyst_trend_details) != 0: + if self._analyst_trend_details is not None : for key in analysis_data['earningsTrend']['trend']: try: earnings_dict = key['earningsEstimate'] diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index 8013098cd..a0cc9cc56 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -1,18 +1,19 @@ +import datetime +import json + import pandas as pd from yfinance import utils from yfinance.data import TickerData +from yfinance.exceptions import YFianceDataException, YFianceException class Fundamentals: - _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' def __init__(self, data: TickerData, proxy=None): self._data = data self.proxy = proxy - self.ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) - 
self._earnings = None self._financials = None self._shares = None @@ -20,22 +21,20 @@ def __init__(self, data: TickerData, proxy=None): self._financials_data = None self._fin_data_quote = None self._basics_already_scraped = False - self._already_scraped_financials = False + self._financials = Fiancials(data) + + @property + def financials(self) -> "Fiancials": + return self._financials @property - def earnings(self): + def earnings(self) -> dict: if self._earnings is None: self._scrape_earnings(self.proxy) return self._earnings @property - def financials(self): - if self._financials is None: - self._scrape_financials(self.proxy) - return self._financials - - @property - def shares(self): + def shares(self) -> pd.DataFrame: if self._shares is None: self._scrape_shares(self.proxy) return self._shares @@ -45,7 +44,7 @@ def _scrape_basics(self, proxy): return self._basics_already_scraped = True - self._financials_data = self._data.get_json_data_stores(self.ticker_url + '/financials', proxy) + self._financials_data = self._data.get_json_data_stores('financials', proxy) try: self._fin_data_quote = self._financials_data['QuoteSummaryStore'] except KeyError: @@ -96,55 +95,136 @@ def _scrape_shares(self, proxy): except Exception: pass - def _scrape_financials(self, proxy): - self._scrape_basics(proxy) - if self._already_scraped_financials: - return - self._already_scraped_financials = True - - # get fundamentals - self._financials = {} - for name in ["income", "balance-sheet", "cash-flow"]: - self._financials[name] = {"yearly": pd.DataFrame(), "quarterly": pd.DataFrame()} - annual, qtr = self._create_financials_table(name, proxy) - if annual is not None: - self._financials[name]["yearly"] = annual - if qtr is not None: - self._financials[name]["quarterly"] = qtr - - def _create_financials_table(self, name, proxy): - acceptable_names = ["income", "balance-sheet", "cash-flow"] - if name not in acceptable_names: - raise ValueError("name '{}' must be one of: {}".format(name, 
acceptable_names)) +class Fiancials: + def __init__(self, data: TickerData): + self._data = data + self._income = {} + self._balance_sheet = {} + self._cash_flow = {} + + def get_income(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._income + if freq not in res: + res[freq] = self._scrape("income", freq, proxy=None) + return res[freq] + + def get_balance_sheet(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._balance_sheet + if freq not in res: + res[freq] = self._scrape("balance-sheet", freq, proxy=None) + return res[freq] + + def get_cash_flow(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._cash_flow + if freq not in res: + res[freq] = self._scrape("cash-flow", freq, proxy=None) + return res[freq] + + def _scrape(self, name, timescale, proxy=None): + allowed_names = ["income", "balance-sheet", "cash-flow"] + allowed_timescales = ["yearly", "quarterly"] + + if name not in allowed_names: + raise ValueError("Illegal argument: name must be one of: {}".format(allowed_names)) + if timescale not in allowed_timescales: + raise ValueError("Illegal argument: timescale must be one of: {}".format(allowed_names)) + + try: + statement = self._create_financials_table(name, timescale, proxy) + if statement is not None: + return statement + except YFianceException as e: + print("Failed to create financials table for {} reason: {}".format(name, repr(e))) + return pd.DataFrame() + + def _create_financials_table(self, name, timescale, proxy): if name == "income": # Yahoo stores the 'income' table internally under 'financials' key name = "financials" - data_stores = self._data.get_json_data_stores(self.ticker_url + '/' + name, proxy) - _stmt_annual = None - _stmt_qtr = None + keys = self._get_datastore_keys(name, proxy) + try: # Developers note: TTM and template stuff allows for reproducing the nested structure # visible on Yahoo website. But more work needed to make it user-friendly! 
Ideally # return a tree data structure instead of Pandas MultiIndex # So until this is implemented, just return simple tables - _stmt_annual = self._data.get_financials_time_series("annual", data_stores, proxy) - _stmt_qtr = self._data.get_financials_time_series("quarterly", data_stores, proxy) - - # template_ttm_order, template_annual_order, template_order, level_detail = utils.build_template(data_store["FinancialTemplateStore"]) - # TTM_dicts, Annual_dicts = utils.retreive_financial_details(data_store['QuoteTimeSeriesStore']) - # if name == "balance-sheet": - # # Note: balance sheet is the only financial statement with no ttm detail - # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order) - # else: - # _stmt_annual = utils.format_annual_financial_statement(level_detail, Annual_dicts, template_annual_order, TTM_dicts, template_ttm_order) - - # Data store doesn't contain quarterly data, so retrieve using different url: - # _qtr_data = utils.get_financials_time_series(self._ticker.ticker, name, "quarterly", ticker_url, proxy, self.session) - # _stmt_qtr = utils.format_quarterly_financial_statement(_qtr_data, level_detail, template_order) + return self.get_financials_time_series(timescale, keys, proxy) except Exception as e: pass - return _stmt_annual, _stmt_qtr + def _get_datastore_keys(self, sub_page, proxy) -> list: + data_stores = self._data.get_json_data_stores(sub_page, proxy) + + # Step 1: get the keys: + def _finditem1(key, obj): + values = [] + if isinstance(obj, dict): + if key in obj.keys(): + values.append(obj[key]) + for k, v in obj.items(): + values += _finditem1(key, v) + elif isinstance(obj, list): + for v in obj: + values += _finditem1(key, v) + return values + + try: + keys = _finditem1("key", data_stores['FinancialTemplateStore']) + except KeyError as e: + raise YFianceDataException("Parsing FinancialTemplateStore failed, reason: {}".format(repr(e))) + + if not keys: + raise 
YFianceDataException("No keys in FinancialTemplateStore") + return keys + + def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.DataFrame: + timescale_translation = {"yearly": "annual", "quarterly": "quarterly"} + timescale = timescale_translation[timescale] + + # Step 2: construct url: + ts_url_base = \ + "https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{0}?symbol={0}" \ + .format(self._data.ticker) + + url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys]) + # Yahoo returns maximum 4 years or 5 quarters, regardless of start_dt: + start_dt = datetime.datetime(2016, 12, 31) + end = (datetime.datetime.now() + datetime.timedelta(days=366)) + url += "&period1={}&period2={}".format(int(start_dt.timestamp()), int(end.timestamp())) + + # Step 3: fetch and reshape data + json_str = self._data.get(url=url, proxy=proxy).text + json_data = json.loads(json_str) + data_raw = json_data["timeseries"]["result"] + # data_raw = [v for v in data_raw if len(v) > 1] # Discard keys with no data + for d in data_raw: + del d["meta"] + + # Now reshape data into a table: + # Step 1: get columns and index: + timestamps = set() + data_unpacked = {} + for x in data_raw: + for k in x.keys(): + if k == "timestamp": + timestamps.update(x[k]) + else: + data_unpacked[k] = x[k] + timestamps = sorted(list(timestamps)) + dates = pd.to_datetime(timestamps, unit="s") + df = pd.DataFrame(columns=dates, index=list(data_unpacked.keys())) + for k, v in data_unpacked.items(): + if df is None: + df = pd.DataFrame(columns=dates, index=[k]) + df.loc[k] = {pd.Timestamp(x["asOfDate"]): x["reportedValue"]["raw"] for x in v} + + df.index = df.index.str.replace("^" + timescale, "", regex=True) + + # Reorder table to match order on Yahoo website + df = df.reindex([k for k in keys if k in df.index]) + df = df[sorted(df.columns, reverse=True)] + + return df diff --git a/yfinance/scrapers/holders.py b/yfinance/scrapers/holders.py index 
96eeb521b..c130c22fa 100644 --- a/yfinance/scrapers/holders.py +++ b/yfinance/scrapers/holders.py @@ -14,19 +14,19 @@ def __init__(self, data: TickerData, proxy=None): self._mutualfund = None @property - def major(self): + def major(self) -> pd.DataFrame: if self._major is None: self._scrape(self.proxy) return self._major @property - def institutional(self): + def institutional(self) -> pd.DataFrame: if self._institutional is None: self._scrape(self.proxy) return self._institutional @property - def mutualfund(self): + def mutualfund(self) -> pd.DataFrame: if self._mutualfund is None: self._scrape(self.proxy) return self._mutualfund diff --git a/yfinance/scrapers/quote.py b/yfinance/scrapers/quote.py index 73740400f..a0f1dac6d 100644 --- a/yfinance/scrapers/quote.py +++ b/yfinance/scrapers/quote.py @@ -8,7 +8,6 @@ class Quote: - _SCRAPE_URL_ = 'https://finance.yahoo.com/quote' def __init__(self, data: TickerData, proxy=None): self._data = data @@ -23,7 +22,7 @@ def __init__(self, data: TickerData, proxy=None): self._already_scraped_complementary = False @property - def info(self): + def info(self) -> dict: if self._info is None: self._scrape(self.proxy) self._scrape_complementary(self.proxy) @@ -31,19 +30,19 @@ def info(self): return self._info @property - def sustainability(self): + def sustainability(self) -> pd.DataFrame: if self._sustainability is None: self._scrape(self.proxy) return self._sustainability @property - def recommendations(self): + def recommendations(self) -> pd.DataFrame: if self._recommendations is None: self._scrape(self.proxy) return self._recommendations @property - def calendar(self): + def calendar(self) -> pd.DataFrame: if self._calendar is None: self._scrape(self.proxy) return self._calendar @@ -53,12 +52,10 @@ def _scrape(self, proxy): return self._already_scraped = True - ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) - # get info and sustainability - json_data = self._data.get_json_data_stores(ticker_url, proxy) + 
json_data = self._data.get_json_data_stores(proxy=proxy) try: - data = json_data['QuoteSummaryStore'] + quote_summary_store = json_data['QuoteSummaryStore'] except KeyError: err_msg = "No summary info found, symbol may be delisted" print('- %s: %s' % (self._data.ticker, err_msg)) @@ -67,10 +64,10 @@ def _scrape(self, proxy): # sustainability d = {} try: - if isinstance(data.get('esgScores'), dict): - for item in data['esgScores']: - if not isinstance(data['esgScores'][item], (dict, list)): - d[item] = data['esgScores'][item] + if isinstance(quote_summary_store.get('esgScores'), dict): + for item in quote_summary_store['esgScores']: + if not isinstance(quote_summary_store['esgScores'][item], (dict, list)): + d[item] = quote_summary_store['esgScores'][item] s = pd.DataFrame(index=[0], data=d)[-1:].T s.columns = ['Value'] @@ -83,41 +80,40 @@ def _scrape(self, proxy): except Exception: pass - # info (be nice to python 2) self._info = {} try: items = ['summaryProfile', 'financialData', 'quoteType', 'defaultKeyStatistics', 'assetProfile', 'summaryDetail'] for item in items: - if isinstance(data.get(item), dict): - self._info.update(data[item]) + if isinstance(quote_summary_store.get(item), dict): + self._info.update(quote_summary_store[item]) except Exception: pass # For ETFs, provide this valuable data: the top holdings of the ETF try: - if 'topHoldings' in data: - self._info.update(data['topHoldings']) + if 'topHoldings' in quote_summary_store: + self._info.update(quote_summary_store['topHoldings']) except Exception: pass try: - if not isinstance(data.get('summaryDetail'), dict): + if not isinstance(quote_summary_store.get('summaryDetail'), dict): # For some reason summaryDetail did not give any results. 
The price dict # usually has most of the same info - self._info.update(data.get('price', {})) + self._info.update(quote_summary_store.get('price', {})) except Exception: pass try: # self._info['regularMarketPrice'] = self._info['regularMarketOpen'] - self._info['regularMarketPrice'] = data.get('price', {}).get( + self._info['regularMarketPrice'] = quote_summary_store.get('price', {}).get( 'regularMarketPrice', self._info.get('regularMarketOpen', None)) except Exception: pass try: - self._info['preMarketPrice'] = data.get('price', {}).get( + self._info['preMarketPrice'] = quote_summary_store.get('price', {}).get( 'preMarketPrice', self._info.get('preMarketPrice', None)) except Exception: pass @@ -136,7 +132,7 @@ def _scrape(self, proxy): # events try: - cal = pd.DataFrame(data['calendarEvents']['earnings']) + cal = pd.DataFrame(quote_summary_store['calendarEvents']['earnings']) cal['earningsDate'] = pd.to_datetime( cal['earningsDate'], unit='s') self._calendar = cal.T @@ -148,7 +144,7 @@ def _scrape(self, proxy): # analyst recommendations try: rec = pd.DataFrame( - data['upgradeDowngradeHistory']['history']) + quote_summary_store['upgradeDowngradeHistory']['history']) rec['earningsDate'] = pd.to_datetime( rec['epochGradeDate'], unit='s') rec.set_index('earningsDate', inplace=True) diff --git a/yfinance/ticker.py b/yfinance/ticker.py index b17a179eb..0561ca891 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -102,39 +102,39 @@ def isin(self): return self.get_isin() @property - def major_holders(self): + def major_holders(self) -> _pd.DataFrame: return self.get_major_holders() @property - def institutional_holders(self): + def institutional_holders(self) -> _pd.DataFrame: return self.get_institutional_holders() @property - def mutualfund_holders(self): + def mutualfund_holders(self) -> _pd.DataFrame: return self.get_mutualfund_holders() @property - def dividends(self): + def dividends(self) -> _pd.Series: return self.get_dividends() @property - def 
splits(self): + def splits(self) -> _pd.Series: return self.get_splits() @property - def actions(self): + def actions(self) -> _pd.DataFrame: return self.get_actions() @property - def shares(self): + def shares(self) -> _pd.DataFrame : return self.get_shares() @property - def info(self): + def info(self) -> dict: return self.get_info() @property - def calendar(self): + def calendar(self) -> _pd.DataFrame: return self.get_calendar() @property @@ -142,43 +142,43 @@ def recommendations(self): return self.get_recommendations() @property - def earnings(self): + def earnings(self) -> _pd.DataFrame: return self.get_earnings() @property - def quarterly_earnings(self): + def quarterly_earnings(self) -> _pd.DataFrame: return self.get_earnings(freq='quarterly') @property - def income_stmt(self): + def income_stmt(self) -> _pd.DataFrame: return self.get_income_stmt() @property - def quarterly_income_stmt(self): + def quarterly_income_stmt(self) -> _pd.DataFrame: return self.get_income_stmt(freq='quarterly') @property - def balance_sheet(self): + def balance_sheet(self) -> _pd.DataFrame: return self.get_balance_sheet() @property - def quarterly_balance_sheet(self): + def quarterly_balance_sheet(self) -> _pd.DataFrame: return self.get_balance_sheet(freq='quarterly') @property - def balancesheet(self): + def balancesheet(self) -> _pd.DataFrame: return self.balance_sheet @property - def quarterly_balancesheet(self): + def quarterly_balancesheet(self) -> _pd.DataFrame: return self.quarterly_balance_sheet @property - def cashflow(self): + def cashflow(self) -> _pd.DataFrame: return self.get_cashflow(freq="yearly") @property - def quarterly_cashflow(self): + def quarterly_cashflow(self) -> _pd.DataFrame: return self.get_cashflow(freq='quarterly') @property @@ -186,19 +186,19 @@ def recommendations_summary(self): return self.get_recommendations_summary() @property - def analyst_price_target(self): + def analyst_price_target(self) -> _pd.DataFrame: return 
self.get_analyst_price_target() @property - def revenue_forecasts(self): + def revenue_forecasts(self) -> _pd.DataFrame: return self.get_rev_forecast() @property - def sustainability(self): + def sustainability(self) -> _pd.DataFrame: return self.get_sustainability() @property - def options(self): + def options(self) -> tuple: if not self._expirations: self._download_options() return tuple(self._expirations.keys()) @@ -208,17 +208,17 @@ def news(self): return self.get_news() @property - def earnings_trend(self): + def earnings_trend(self) -> _pd.DataFrame: return self.get_earnings_trend() @property - def earnings_history(self): + def earnings_history(self) -> _pd.DataFrame: return self.get_earnings_history() @property - def earnings_dates(self): + def earnings_dates(self) -> _pd.DataFrame: return self.get_earnings_dates() @property - def earnings_forecasts(self): + def earnings_forecasts(self) -> _pd.DataFrame: return self.get_earnings_forecast() From fa7d743826da2bfa0e7d05f67487b8fb93e8cc29 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Sun, 13 Nov 2022 21:26:34 +0100 Subject: [PATCH 05/41] Do persist requests_cache in tests. 
--- tests/prices.py | 2 +- tests/ticker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/prices.py b/tests/prices.py index 92718b057..788a8a224 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -15,7 +15,7 @@ class TestPriceHistory(unittest.TestCase): @classmethod def setUpClass(cls): - cls.session = requests_cache.CachedSession() + cls.session = requests_cache.CachedSession(backend='memory') @classmethod def tearDownClass(cls): diff --git a/tests/ticker.py b/tests/ticker.py index dd2e990fe..9427d0c62 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -29,7 +29,7 @@ class TestTicker(unittest.TestCase): @classmethod def setUpClass(cls): - cls.session = requests_cache.CachedSession() + cls.session = requests_cache.CachedSession(backend='memory') @classmethod def tearDownClass(cls): From 0f433d7e5d407ae667e47525a93d302853f0046e Mon Sep 17 00:00:00 2001 From: Thirumalai Raj R Date: Mon, 14 Nov 2022 15:01:27 +0530 Subject: [PATCH 06/41] Add capital gains data only for MutualFund and ETF --- yfinance/base.py | 35 +++++++++++++++++++++++++---------- yfinance/utils.py | 4 ++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index cc1393593..26ec78b82 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -190,7 +190,6 @@ def history(self, period="1mo", interval="1d", params["interval"] = interval.lower() params["includePrePost"] = prepost - params["events"] = "div,splits,capitalGains" # 1) fix weired bug with Yahoo! 
- returning 60m for 30m bars if params["interval"] == "30m": @@ -202,6 +201,18 @@ def history(self, period="1mo", interval="1d", proxy = proxy["https"] proxy = {"https": proxy} + #if the ticker is MUTUALFUND or ETF, then get capitalGains events + self._get_info(proxy) + data = self._info + is_capital_gains_data_supported = False + if 'quoteType' in data and data['quoteType'] in ('MUTUALFUND', 'ETF'): + is_capital_gains_data_supported = True + + if is_capital_gains_data_supported: + params["events"] = "div,splits,capitalGains" + else: + params["events"] = "div,splits" + # Getting data from json url = "{}/v8/finance/chart/{}".format(self._base_url, self.ticker) @@ -324,12 +335,15 @@ def history(self, period="1mo", interval="1d", quotes['Volume'] = quotes['Volume'].fillna(0).astype(_np.int64) # actions - dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0]) + dividends, splits, capital_gains = utils.parse_actions( + data["chart"]["result"][0], + is_capital_gains_data_supported + ) if start is not None: startDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(start)) if dividends is not None: dividends = dividends[dividends.index>=startDt] - if capital_gains is not None: + if is_capital_gains_data_supported and capital_gains is not None: capital_gains = capital_gains[capital_gains.index>=startDt] if splits is not None: splits = splits[splits.index>=startDt] @@ -337,7 +351,7 @@ def history(self, period="1mo", interval="1d", endDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(end)) if dividends is not None: dividends = dividends[dividends.index 0: - df = utils.safe_merge_dfs(df, capital_gains, interval) - if "Capital Gains" in df.columns: - df.loc[df["Capital Gains"].isna(),"Capital Gains"] = 0 - else: - df["Capital Gains"] = 0.0 + if is_capital_gains_data_supported: + if capital_gains.shape[0] > 0: + df = utils.safe_merge_dfs(df, capital_gains, interval) + if "Capital Gains" in df.columns: + df.loc[df["Capital 
Gains"].isna(),"Capital Gains"] = 0 + else: + df["Capital Gains"] = 0.0 if params["interval"][-1] in ("m",'h'): df.index.name = "Datetime" diff --git a/yfinance/utils.py b/yfinance/utils.py index 2597c8bc7..8990face5 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -233,7 +233,7 @@ def parse_quotes(data): return quotes -def parse_actions(data): +def parse_actions(data, is_capital_gains_data_supported): dividends = _pd.DataFrame( columns=["Dividends"], index=_pd.DatetimeIndex([])) capital_gains = _pd.DataFrame( @@ -251,7 +251,7 @@ def parse_actions(data): dividends.columns = ["Dividends"] - if "capitalGains" in data["events"]: + if is_capital_gains_data_supported and "capitalGains" in data["events"]: capital_gains = _pd.DataFrame( data=list(data["events"]["capitalGains"].values())) capital_gains.set_index("date", inplace=True) From c56e3496dbc6a701c4bcb94787acda7e2928b32d Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Mon, 14 Nov 2022 22:03:36 +0100 Subject: [PATCH 07/41] Align requirements.txt file with setup.py package dependencies. 
--- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 28964912b..d63896ebb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pandas>=0.24.0 +pandas>=1.1.0 numpy>=1.16.5 requests>=2.26 multitasking>=0.0.7 @@ -7,4 +7,4 @@ appdirs>=1.4.4 pytz>=2022.5 frozendict>=2.3.4 beautifulsoup4>=4.11.1 -html5lib>=1.1 \ No newline at end of file +html5lib>=1.1 From 6067d2a59081f12eacc852842b1338d9a51251ce Mon Sep 17 00:00:00 2001 From: Thirumalai Raj R Date: Tue, 15 Nov 2022 14:23:31 +0530 Subject: [PATCH 08/41] Addressing PR review comments --- yfinance/base.py | 22 +++++++++------------- yfinance/utils.py | 5 ++--- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 2fc2535d7..74c06440e 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -189,11 +189,7 @@ def history(self, period="1mo", interval="1d", #if the ticker is MUTUALFUND or ETF, then get capitalGains events self._get_info(proxy) data = self._info - is_capital_gains_data_supported = False if 'quoteType' in data and data['quoteType'] in ('MUTUALFUND', 'ETF'): - is_capital_gains_data_supported = True - - if is_capital_gains_data_supported: params["events"] = "div,splits,capitalGains" else: params["events"] = "div,splits" @@ -320,15 +316,12 @@ def history(self, period="1mo", interval="1d", quotes['Volume'] = quotes['Volume'].fillna(0).astype(_np.int64) # actions - dividends, splits, capital_gains = utils.parse_actions( - data["chart"]["result"][0], - is_capital_gains_data_supported - ) + dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0]) if start is not None: startDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(start)) if dividends is not None: dividends = dividends[dividends.index>=startDt] - if is_capital_gains_data_supported and capital_gains is not None: + if "capitalGains" in params["events"] and capital_gains is not None: 
capital_gains = capital_gains[capital_gains.index>=startDt] if splits is not None: splits = splits[splits.index >= startDt] @@ -336,7 +329,7 @@ def history(self, period="1mo", interval="1d", endDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(end)) if dividends is not None: dividends = dividends[dividends.index 0: df = utils.safe_merge_dfs(df, capital_gains, interval) if "Capital Gains" in df.columns: @@ -1195,8 +1188,11 @@ def get_splits(self, proxy=None): def get_actions(self, proxy=None): if self._history is None: self.history(period="max", proxy=proxy) - if self._history is not None and "Dividends" in self._history and "Stock Splits" in self._history and "Capital Gains" in self._history: - actions = self._history[["Dividends", "Stock Splits", "Capital Gains"]] + if self._history is not None and "Dividends" in self._history and "Stock Splits" in self._history: + action_columns = ["Dividends", "Stock Splits"] + if "Capital Gains" in self._history: + action_columns.append("Capital Gains") + actions = self._history[action_columns] return actions[actions != 0].dropna(how='all').fillna(0) return [] diff --git a/yfinance/utils.py b/yfinance/utils.py index 3a6337f27..da0afac68 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -329,7 +329,7 @@ def parse_quotes(data): return quotes -def parse_actions(data, is_capital_gains_data_supported): +def parse_actions(data): dividends = _pd.DataFrame( columns=["Dividends"], index=_pd.DatetimeIndex([])) capital_gains = _pd.DataFrame( @@ -344,10 +344,9 @@ def parse_actions(data, is_capital_gains_data_supported): dividends.set_index("date", inplace=True) dividends.index = _pd.to_datetime(dividends.index, unit="s") dividends.sort_index(inplace=True) - dividends.columns = ["Dividends"] - if is_capital_gains_data_supported and "capitalGains" in data["events"]: + if "capitalGains" in data["events"]: capital_gains = _pd.DataFrame( data=list(data["events"]["capitalGains"].values())) capital_gains.set_index("date", 
inplace=True) From c6f760e61c07d205fe071f023830e766ed11014c Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Tue, 15 Nov 2022 22:43:58 +0100 Subject: [PATCH 09/41] Fixed #1172 - exception if tz cache file was empty --- yfinance/utils.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/yfinance/utils.py b/yfinance/utils.py index 689238f6b..d82432eae 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -721,12 +721,17 @@ def tz_db(self): def _migrate_cache_tkr_tz(self): """Migrate contents from old ticker CSV-cache to SQLite db""" - fp = _os.path.join(self._db_dir, "tkr-tz.csv") - if not _os.path.isfile(fp): + old_cache_file_path = _os.path.join(self._db_dir, "tkr-tz.csv") + + if not _os.path.isfile(old_cache_file_path): return None - df = _pd.read_csv(fp, index_col="Ticker") - self.tz_db.bulk_set(df.to_dict()['Tz']) - _os.remove(fp) + try: + df = _pd.read_csv(old_cache_file_path, index_col="Ticker") + except _pd.errors.EmptyDataError: + _os.remove(old_cache_file_path) + else: + self.tz_db.bulk_set(df.to_dict()['Tz']) + _os.remove(old_cache_file_path) class _TzCacheDummy: From 2970d9460f4e5ce515fd99d189411577ce411169 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Wed, 16 Nov 2022 12:34:36 +0000 Subject: [PATCH 10/41] Fix localizing midnight when non-existent (DST) #1174 --- yfinance/base.py | 6 +++--- yfinance/utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index cb6f9c293..acb8e4d57 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -325,11 +325,11 @@ def history(self, period="1mo", interval="1d", if not intraday: # If localizing a midnight during DST transition hour when clocks roll back, # meaning clock hits midnight twice, then use the 2nd (ambiguous=True) - quotes.index = _pd.to_datetime(quotes.index.date).tz_localize(tz_exchange, ambiguous=True) + quotes.index = _pd.to_datetime(quotes.index.date).tz_localize(tz_exchange, ambiguous=True, 
nonexistent='shift_forward') if dividends.shape[0] > 0: - dividends.index = _pd.to_datetime(dividends.index.date).tz_localize(tz_exchange, ambiguous=True) + dividends.index = _pd.to_datetime(dividends.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward') if splits.shape[0] > 0: - splits.index = _pd.to_datetime(splits.index.date).tz_localize(tz_exchange, ambiguous=True) + splits.index = _pd.to_datetime(splits.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward') # Combine df = quotes.sort_index() diff --git a/yfinance/utils.py b/yfinance/utils.py index d82432eae..a19ca7c4e 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -472,7 +472,7 @@ def _reindex_events(df, new_index, data_col_name): new_index = None if new_index is not None: - new_index = new_index.tz_localize(df.index.tz, ambiguous=True) + new_index = new_index.tz_localize(df.index.tz, ambiguous=True, nonexistent='shift_forward') df_sub = _reindex_events(df_sub, new_index, data_col) df = df_main.join(df_sub) From c5c156732117768546398c19383340deea12b831 Mon Sep 17 00:00:00 2001 From: Thirumalai Raj R Date: Thu, 17 Nov 2022 19:28:13 +0530 Subject: [PATCH 11/41] Handle non existant columns while downloading --- yfinance/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yfinance/base.py b/yfinance/base.py index 74c06440e..0f2497d41 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -382,7 +382,8 @@ def history(self, period="1mo", interval="1d", df = df[~df.index.duplicated(keep='first')] self._history = df.copy() if not actions: - df = df.drop(columns=["Dividends", "Stock Splits", "Capital Gains"]) + columns_to_be_dropped = ["Dividends", "Stock Splits", "Capital Gains"] + df = df.drop([colname for colname in columns_to_be_dropped if colname in df.columns], axis=1) if not keepna: mask_nan_or_zero = (df.isna() | (df == 0)).all(axis=1) df = df.drop(mask_nan_or_zero.index[mask_nan_or_zero]) From 
ca27d070f071b8d04f6ca2bf31edf40057492bd0 Mon Sep 17 00:00:00 2001 From: Thirumalai Raj R Date: Thu, 17 Nov 2022 19:39:57 +0530 Subject: [PATCH 12/41] Migrating to get_info function --- yfinance/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index ae8abe1c3..db6d35c07 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -178,8 +178,7 @@ def history(self, period="1mo", interval="1d", proxy = {"https": proxy} #if the ticker is MUTUALFUND or ETF, then get capitalGains events - self._get_info(proxy) - data = self._info + data = self.get_info(proxy) if 'quoteType' in data and data['quoteType'] in ('MUTUALFUND', 'ETF'): params["events"] = "div,splits,capitalGains" else: From e6211896f7af95bcf7081dc0cd086a09d127683e Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Fri, 18 Nov 2022 15:35:14 +0100 Subject: [PATCH 13/41] Add glob try/except in threaded implementation. Needed as current thead implementation breaks if exception is raised. --- yfinance/multi.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/yfinance/multi.py b/yfinance/multi.py index 9a36df2df..7d2d9dbac 100644 --- a/yfinance/multi.py +++ b/yfinance/multi.py @@ -199,10 +199,16 @@ def _download_one_threaded(ticker, start=None, end=None, actions=False, progress=True, period="max", interval="1d", prepost=False, proxy=None, keepna=False, rounding=False, timeout=10): - data = _download_one(ticker, start, end, auto_adjust, back_adjust, repair, - actions, period, interval, prepost, proxy, rounding, - keepna, timeout) - shared._DFS[ticker.upper()] = data + try: + data = _download_one(ticker, start, end, auto_adjust, back_adjust, repair, + actions, period, interval, prepost, proxy, rounding, + keepna, timeout) + except Exception as e: + # glob try/except needed as current thead implementation breaks if exception is raised. 
+ shared._DFS[ticker] = utils.empty_df() + shared._ERRORS[ticker] = repr(e) + else: + shared._DFS[ticker.upper()] = data if progress: shared._PROGRESS_BAR.animate() From 06640102f8d6fc4506f694c3c8b3cb7ed8e68b73 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Sun, 20 Nov 2022 02:04:35 +0100 Subject: [PATCH 14/41] Made fetching earnings_dates faster Avoid unnecessary request when fetching earnings_date. Added support to limit argument to only fetch as many as needed. --- tests/ticker.py | 10 ++++++++++ yfinance/base.py | 25 ++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tests/ticker.py b/tests/ticker.py index 9427d0c62..cd07c0e8e 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -207,6 +207,16 @@ def test_earnings_trend(self): data_cached = self.ticker.earnings_trend self.assertIs(data, data_cached, "data not cached") + def test_earnings_dates_with_limit(self): + limit = 5 + data = self.ticker.get_earnings_dates(limit=limit) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertEqual(len(data), limit, "Wrong number or rows") + + data_cached = self.ticker.get_earnings_dates(limit=limit) + self.assertIs(data, data_cached, "data not cached") + class TestTickerHolders(unittest.TestCase): diff --git a/yfinance/base.py b/yfinance/base.py index cb6f9c293..de070220a 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -59,7 +59,7 @@ def __init__(self, ticker, session=None): self._news = [] self._shares = None - self._earnings_dates = None + self._earnings_dates = {} self._earnings_history = None self._earnings = None @@ -847,11 +847,18 @@ def get_news(self, proxy=None): self._news = data.get("news", []) return self._news - def get_earnings_dates(self, proxy=None) -> Optional[pd.DataFrame]: - if self._earnings_dates is not None: - return self._earnings_dates + def get_earnings_dates(self, limit=1000, proxy=None) -> Optional[pd.DataFrame]: + """ 
+ Get earning dates (future and historic) + :param limit: max amount of upcoming and recent earnings dates, set to smaller value to reduce amount + of requests needed if ticker has a long history that is not of interest. + :param proxy: requests proxy to use. + :return: pandas dataframe + """ + if self._earnings_dates and limit in self._earnings_dates: + return self._earnings_dates[limit] - page_size = 100 # YF caps at 100, don't go higher + page_size = min(limit, 100) # YF caps at 100, don't go higher page_offset = 0 dates = None while True: @@ -874,12 +881,16 @@ def get_earnings_dates(self, proxy=None) -> Optional[pd.DataFrame]: # Actually YF was successful, problem is company doesn't have earnings history dates = utils.empty_earnings_dates_df() break - if dates is None: dates = data else: dates = _pd.concat([dates, data], axis=0) + page_offset += page_size + # got less data then we asked for or already fetched all we requested, no need to fetch more pages + if len(data) > page_size or len(dates) >= limit: + dates = dates.iloc[:limit] + break if dates is None or dates.shape[0] == 0: err_msg = "No earnings dates found, symbol may be delisted" @@ -916,7 +927,7 @@ def get_earnings_dates(self, proxy=None) -> Optional[pd.DataFrame]: dates = dates.set_index("Earnings Date") - self._earnings_dates = dates + self._earnings_dates[limit] = dates return dates From 80c659be71cf378827c5fb57a74e302b6ef1fb05 Mon Sep 17 00:00:00 2001 From: Thirumalai Raj R Date: Mon, 21 Nov 2022 17:12:15 +0530 Subject: [PATCH 15/41] Addressing PR comments --- README.md | 4 ---- yfinance/base.py | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/README.md b/README.md index 9d720f690..a25710482 100644 --- a/README.md +++ b/README.md @@ -72,10 +72,6 @@ msft.splits # show capital gains (for mutual funds & etfs) msft.capital_gains -# show financials -msft.financials -msft.quarterly_financials - # show share count msft.shares diff --git a/yfinance/base.py b/yfinance/base.py index 
db6d35c07..aa163fe8c 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -374,8 +374,7 @@ def history(self, period="1mo", interval="1d", df = df[~df.index.duplicated(keep='first')] self._history = df.copy() if not actions: - columns_to_be_dropped = ["Dividends", "Stock Splits", "Capital Gains"] - df = df.drop([colname for colname in columns_to_be_dropped if colname in df.columns], axis=1) + df = df.drop(columns=["Dividends", "Stock Splits", "Capital Gains"], errors='ignore') if not keepna: mask_nan_or_zero = (df.isna() | (df == 0)).all(axis=1) df = df.drop(mask_nan_or_zero.index[mask_nan_or_zero]) From 20680b0e38cb75e4f92573e5e619beebaa566a69 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Sun, 20 Nov 2022 16:38:49 +0100 Subject: [PATCH 16/41] Lowered get earnings_dates limit and removed earnings_history API. earnings_history/get_earnings_history was redundant as it was an incomplete implementation of get_earnings_dates(). --- README.md | 3 ++- tests/ticker.py | 18 ++++++------------ yfinance/base.py | 40 +++++++++------------------------------- yfinance/ticker.py | 4 ---- 4 files changed, 17 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 94728b8ee..7d5ac45c0 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,8 @@ mfst.earnings_trend # show next event (earnings, etc) msft.calendar -# show all earnings dates +# Show future and historic earnings dates, returns at most next 4 quarters and last 8 quarters by default. +# Note: If more are needed use msft.get_earnings_dates(limit=XX) with increased limit argument. 
msft.earnings_dates # show ISIN code - *experimental* diff --git a/tests/ticker.py b/tests/ticker.py index cd07c0e8e..55642be10 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -159,14 +159,6 @@ def setUp(self): def tearDown(self): self.ticker = None - def test_earnings_history(self): - data = self.ticker.earnings_history - self.assertIsInstance(data, pd.DataFrame, "data has wrong type") - self.assertFalse(data.empty, "data is empty") - - data_cached = self.ticker.earnings_history - self.assertIs(data, data_cached, "data not cached") - def test_earnings(self): data = self.ticker.earnings self.assertIsInstance(data, pd.DataFrame, "data has wrong type") @@ -208,13 +200,15 @@ def test_earnings_trend(self): self.assertIs(data, data_cached, "data not cached") def test_earnings_dates_with_limit(self): - limit = 5 - data = self.ticker.get_earnings_dates(limit=limit) + # use ticker with lots of historic earnings + ticker = yf.Ticker("IBM") + limit = 110 + data = ticker.get_earnings_dates(limit=limit) self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") self.assertEqual(len(data), limit, "Wrong number or rows") - data_cached = self.ticker.get_earnings_dates(limit=limit) + data_cached = ticker.get_earnings_dates(limit=limit) self.assertIs(data, data_cached, "data not cached") @@ -392,7 +386,7 @@ def test_info(self): self.assertEqual("GOOGL", data["symbol"], "Wrong symbol value in info dict") def test_bad_freq_value_raises_exception(self): - self.assertRaises(ValueError, lambda : self.ticker.get_cashflow(freq="badarg")) + self.assertRaises(ValueError, lambda: self.ticker.get_cashflow(freq="badarg")) def suite(): diff --git a/yfinance/base.py b/yfinance/base.py index de070220a..633c43d89 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -60,7 +60,6 @@ def __init__(self, ticker, session=None): self._shares = None self._earnings_dates = {} - self._earnings_history = None self._earnings = None self._financials = 
None @@ -847,11 +846,13 @@ def get_news(self, proxy=None): self._news = data.get("news", []) return self._news - def get_earnings_dates(self, limit=1000, proxy=None) -> Optional[pd.DataFrame]: + def get_earnings_dates(self, limit=12, proxy=None) -> Optional[pd.DataFrame]: """ Get earning dates (future and historic) - :param limit: max amount of upcoming and recent earnings dates, set to smaller value to reduce amount - of requests needed if ticker has a long history that is not of interest. + :param limit: max amount of upcoming and recent earnings dates to return. + Default value 12 should return next 4 quarters and last 8 quarters. + Increase if more history is needed. + :param proxy: requests proxy to use. :return: pandas dataframe """ @@ -888,9 +889,12 @@ def get_earnings_dates(self, limit=1000, proxy=None) -> Optional[pd.DataFrame]: page_offset += page_size # got less data then we asked for or already fetched all we requested, no need to fetch more pages - if len(data) > page_size or len(dates) >= limit: + if len(data) < page_size or len(dates) >= limit: dates = dates.iloc[:limit] break + else: + # do not fetch more than needed next time + page_size = min(limit - len(dates), page_size) if dates is None or dates.shape[0] == 0: err_msg = "No earnings dates found, symbol may be delisted" @@ -930,29 +934,3 @@ def get_earnings_dates(self, limit=1000, proxy=None) -> Optional[pd.DataFrame]: self._earnings_dates[limit] = dates return dates - - def get_earnings_history(self, proxy=None) -> Optional[pd.DataFrame]: - if self._earnings_history is not None: - return self._earnings_history - - url = "{}/calendar/earnings?symbol={}".format(_ROOT_URL_, self.ticker) - data = self._data.get(url=url, proxy=proxy).text - - if "Will be right back" in data: - raise RuntimeError("*** YAHOO! FINANCE IS CURRENTLY DOWN! ***\n" - "Our engineers are working quickly to resolve " - "the issue. 
Thank you for your patience.") - - try: - # read_html returns a list of pandas Dataframes of all the tables in `data` - data = _pd.read_html(data)[0] - data.replace("-", _np.nan, inplace=True) - - data['EPS Estimate'] = _pd.to_numeric(data['EPS Estimate']) - data['Reported EPS'] = _pd.to_numeric(data['Reported EPS']) - self._earnings_history = data - # if no tables are found a ValueError is thrown - except ValueError: - print("Could not find earnings history data for {}.".format(self.ticker)) - return - return data diff --git a/yfinance/ticker.py b/yfinance/ticker.py index 0561ca891..08d059c08 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -211,10 +211,6 @@ def news(self): def earnings_trend(self) -> _pd.DataFrame: return self.get_earnings_trend() - @property - def earnings_history(self) -> _pd.DataFrame: - return self.get_earnings_history() - @property def earnings_dates(self) -> _pd.DataFrame: return self.get_earnings_dates() From bd3569367e0c6064336346da84bebed6ee427612 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Mon, 21 Nov 2022 17:04:15 +0000 Subject: [PATCH 17/41] Bugfix for PR #1166 --- yfinance/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yfinance/base.py b/yfinance/base.py index aa163fe8c..085ac0708 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -179,7 +179,7 @@ def history(self, period="1mo", interval="1d", #if the ticker is MUTUALFUND or ETF, then get capitalGains events data = self.get_info(proxy) - if 'quoteType' in data and data['quoteType'] in ('MUTUALFUND', 'ETF'): + if not data is None and 'quoteType' in data and data['quoteType'] in ('MUTUALFUND', 'ETF'): params["events"] = "div,splits,capitalGains" else: params["events"] = "div,splits" From 566a38b432803a70e34a1dded23319d5f0cc2f55 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Tue, 22 Nov 2022 21:46:04 +0000 Subject: [PATCH 18/41] Fix financials index formatting --- yfinance/scrapers/fundamentals.py | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index a0cc9cc56..0c5ae57bd 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -227,4 +227,6 @@ def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.Da df = df.reindex([k for k in keys if k in df.index]) df = df[sorted(df.columns, reverse=True)] + df.index = utils.camel2title(df.index) + return df From ab1476c0d17d6cbc5c773541436f86b8fa917a35 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Tue, 22 Nov 2022 21:46:26 +0000 Subject: [PATCH 19/41] Restore financials nesting code (commented) --- yfinance/scrapers/fundamentals.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index 0c5ae57bd..58f9b4f56 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -150,6 +150,15 @@ def _create_financials_table(self, name, timescale, proxy): # visible on Yahoo website. But more work needed to make it user-friendly! Ideally # return a tree data structure instead of Pandas MultiIndex # So until this is implemented, just return simple tables + # + # _fin_data = self._data.get_json_data_stores(name, proxy) + # data = _fin_data['FinancialTemplateStore'] + # financials_template_ttm_order, financials_template_annual_order, financials_template_order, financials_level_detail = utils.build_template(data) + # # Grab the raw financial details (this can be later combined with the financial template store detail to correctly order and present the data). 
+ # data = _fin_data['QuoteTimeSeriesStore'] + # TTM_dicts, Annual_dicts = utils.retreive_financial_details(data) + # _nested_table = utils.format_annual_financial_statement(financials_level_detail, Annual_dicts, financials_template_annual_order, TTM_dicts, financials_template_ttm_order) + return self.get_financials_time_series(timescale, keys, proxy) except Exception as e: From b3b36c5cc9ebd249db79eb1323a50fecef25615b Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Tue, 22 Nov 2022 22:17:07 +0000 Subject: [PATCH 20/41] Restore old financials as backup if new missing --- yfinance/scrapers/fundamentals.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index a0cc9cc56..f89ca7803 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -2,6 +2,7 @@ import json import pandas as pd +import numpy as np from yfinance import utils from yfinance.data import TickerData @@ -132,6 +133,13 @@ def _scrape(self, name, timescale, proxy=None): try: statement = self._create_financials_table(name, timescale, proxy) + + if statement.shape[0] == 0: + # Normally table only empty when nothing on Yahoo. So good? + # Except 'QuoteSummaryStore' still contains the old financial data, + # is it useful to return? 
+ statement = self._create_financials_table_old(name, timescale, proxy) + if statement is not None: return statement except YFianceException as e: @@ -155,6 +163,34 @@ def _create_financials_table(self, name, timescale, proxy): except Exception as e: pass + def _create_financials_table_old(self, name, timescale, proxy): + data_stores = self._data.get_json_data_stores(name, proxy) + + # Fetch raw data + data = data_stores["QuoteSummaryStore"] + key1 = name.replace('-','') + "StatementHistory" + if timescale == "quarterly": + key1 += "Quarterly" + key2 = name.replace('-','') + "Statements" + data = data.get(key1)[key2] + + # Tabulate + df = pd.DataFrame(data).drop(columns=['maxAge']) + for col in df.columns: + df[col] = df[col].replace('-', np.nan) + df.set_index('endDate', inplace=True) + try: + df.index = pd.to_datetime(df.index, unit='s') + except ValueError: + df.index = pd.to_datetime(df.index) + df = df.T + df.columns.name = '' + df.index.name = 'Breakdown' + # rename incorrect yahoo key + df.rename(index={'treasuryStock': 'Gains Losses Not Affecting Retained Earnings'}, inplace=True) + df.index = utils.camel2title(df.index) + return df + def _get_datastore_keys(self, sub_page, proxy) -> list: data_stores = self._data.get_json_data_stores(sub_page, proxy) From 379b87d9257c2e15de426b30ee066790e0c171ad Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Wed, 23 Nov 2022 17:45:45 +0000 Subject: [PATCH 21/41] Moved financials formatting up into get() Moved financials formatting up into get(), controlled by new 'pretty' argument. Extend camel2title() to accept different separator char and to preserve acronyms case e.g. 
'EBIT' --- yfinance/base.py | 12 +++++++++--- yfinance/scrapers/fundamentals.py | 11 ----------- yfinance/utils.py | 30 ++++++++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 0c2a38c82..0adc7008f 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -757,23 +757,29 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): return dict_data return data - def get_income_stmt(self, proxy=None, as_dict=False, freq="yearly"): + def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly"): self._fundamentals.proxy = proxy data = self._fundamentals.financials.get_income(freq=freq, proxy=proxy) + if pretty: + data.index = utils.camel2title(data.index, sep=' ', acronyms=["EBIT", "EBITDA", "EPS", "NI"]) if as_dict: return data.to_dict() return data - def get_balance_sheet(self, proxy=None, as_dict=False, freq="yearly"): + def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearly"): self._fundamentals.proxy = proxy data = self._fundamentals.financials.get_balance_sheet(freq=freq, proxy=proxy) + if pretty: + data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() return data - def get_cashflow(self, proxy=None, as_dict=False, freq="yearly"): + def get_cashflow(self, proxy=None, as_dict=False, pretty=False, freq="yearly"): self._fundamentals.proxy = proxy data = self._fundamentals.financials.get_cash_flow(freq=freq, proxy=proxy) + if pretty: + data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() return data diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index 58f9b4f56..a0cc9cc56 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -150,15 +150,6 @@ def _create_financials_table(self, name, timescale, proxy): # visible on Yahoo website. 
But more work needed to make it user-friendly! Ideally # return a tree data structure instead of Pandas MultiIndex # So until this is implemented, just return simple tables - # - # _fin_data = self._data.get_json_data_stores(name, proxy) - # data = _fin_data['FinancialTemplateStore'] - # financials_template_ttm_order, financials_template_annual_order, financials_template_order, financials_level_detail = utils.build_template(data) - # # Grab the raw financial details (this can be later combined with the financial template store detail to correctly order and present the data). - # data = _fin_data['QuoteTimeSeriesStore'] - # TTM_dicts, Annual_dicts = utils.retreive_financial_details(data) - # _nested_table = utils.format_annual_financial_statement(financials_level_detail, Annual_dicts, financials_template_annual_order, TTM_dicts, financials_template_ttm_order) - return self.get_financials_time_series(timescale, keys, proxy) except Exception as e: @@ -236,6 +227,4 @@ def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.Da df = df.reindex([k for k in keys if k in df.index]) df = df[sorted(df.columns, reverse=True)] - df.index = utils.camel2title(df.index) - return df diff --git a/yfinance/utils.py b/yfinance/utils.py index 16930d365..9811ed675 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -241,8 +241,34 @@ def format_quarterly_financial_statement(_statement, level_detail, order): return _statement -def camel2title(o): - return [_re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", i).title() for i in o] +def camel2title(o, sep=' ', acronyms=None): + if not isinstance(sep, str) or len(sep) != 1: + raise Exception("camel2title() 'sep' argument must be single character not:", sep) + + if acronyms is None: + return [_re.sub("([a-z])([A-Z])", r"\g<1>{}\g<2>".format(sep), i).title() for i in o] + + # Handling acronyms requires more care. 
Assumes Yahoo returns acronym strings upper-case + + # Insert 'sep' between lower-then-upper-case + pat = "([a-z])([A-Z])" + rep = r"\g<1>{}\g<2>".format(sep) + o = [_re.sub(pat, rep, i) for i in o] + + # Insert 'sep' after acronyms. Assumes Yahoo returns acronym strings upper-case + if not isinstance(acronyms, (set,list)): + raise Exception("camel2title() 'acronyms' argument should be an iterable of acronym strings") + for a in acronyms: + pat = "("+a+")" + "([A-Z][a-z])" + rep = r"\g<1>{}\g<2>".format(sep) + o = [_re.sub(pat, rep, i) for i in o] + + # Apply str.title() to non-acronym words + o = [i.split(sep) for i in o] + o = [ [j.title() if not j in acronyms else j for j in i] for i in o] + o = [sep.join(i) for i in o] + + return o def _parse_user_dt(dt, exchange_tz): From dfb15e67788389a200374d5f98297b7170170fe6 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Wed, 23 Nov 2022 18:16:51 +0000 Subject: [PATCH 22/41] Unit tests for financials formatting --- tests/ticker.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/ticker.py b/tests/ticker.py index 55642be10..037a0860c 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -273,6 +273,14 @@ def test_quarterly_income_statement(self): data_cached = self.ticker.quarterly_income_stmt self.assertIs(data, data_cached, "data not cached") + def test_income_statement_formatting(self): + expected_keys = ["Total Revenue", "Basic EPS"] + data = self.ticker.get_income_stmt(pretty=True) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + def test_balance_sheet(self): expected_row = "TotalAssets" data = self.ticker.balance_sheet @@ -293,6 +301,14 @@ def test_quarterly_balance_sheet(self): data_cached = self.ticker.quarterly_balance_sheet self.assertIs(data, data_cached, "data not cached") + def test_balance_sheet_formatting(self): + 
expected_keys = ["Total Assets", "Net PPE"] + data = self.ticker.get_balance_sheet(pretty=True) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + def test_cashflow(self): expected_row = "OperatingCashFlow" data = self.ticker.cashflow @@ -313,6 +329,14 @@ def test_quarterly_cashflow(self): data_cached = self.ticker.quarterly_cashflow self.assertIs(data, data_cached, "data not cached") + def test_cashflow_formatting(self): + expected_keys = ["Operating Cash Flow", "Net PPE Purchase And Sale"] + data = self.ticker.get_cashflow(pretty=True) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + def test_sustainability(self): data = self.ticker.sustainability self.assertIsInstance(data, pd.DataFrame, "data has wrong type") From 3b19ef12bc94d45415b1e5a48c83aa740019ee52 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Thu, 24 Nov 2022 17:16:30 +0000 Subject: [PATCH 23/41] camel2title(): restrict acceptable inputs --- yfinance/utils.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/yfinance/utils.py b/yfinance/utils.py index 9811ed675..1d8363dea 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -241,34 +241,46 @@ def format_quarterly_financial_statement(_statement, level_detail, order): return _statement -def camel2title(o, sep=' ', acronyms=None): +def camel2title(strings: list[str], sep: str=' ', acronyms: list[str] | None =None) -> list[str]: + if isinstance(strings, str) or not hasattr(strings, '__iter__') or not isinstance(strings[0], str): + raise Exception("camel2title() 'strings' argument must be iterable of strings") if not isinstance(sep, str) or len(sep) != 1: - raise 
Exception("camel2title() 'sep' argument must be single character not:", sep) + raise Exception(f"camel2title() 'sep' argument = '{sep}' must be single character") + if _re.match("[a-zA-Z0-9]", sep): + raise Exception(f"camel2title() 'sep' argument = '{sep}' cannot be alpha-numeric") + if _re.escape(sep) != sep and sep not in {' ', '-'}: + # Permit some exceptions, I don't understand why they get escaped + raise Exception(f"camel2title() 'sep' argument = '{sep}' cannot be special character") if acronyms is None: - return [_re.sub("([a-z])([A-Z])", r"\g<1>{}\g<2>".format(sep), i).title() for i in o] + pat = "([a-z])([A-Z])" + rep = rf"\g<1>{sep}\g<2>" + return [_re.sub(pat, rep, s).title() for s in strings] # Handling acronyms requires more care. Assumes Yahoo returns acronym strings upper-case + if isinstance(acronyms, str) or not hasattr(acronyms, '__iter__') or not isinstance(acronyms[0], str): + raise Exception("camel2title() 'acronyms' argument must be iterable of strings") + for a in acronyms: + if not _re.match("^[A-Z]+$", a): + raise Exception(f"camel2title() 'acronyms' argument must only contain upper-case, but '{a}' detected") # Insert 'sep' between lower-then-upper-case pat = "([a-z])([A-Z])" - rep = r"\g<1>{}\g<2>".format(sep) - o = [_re.sub(pat, rep, i) for i in o] + rep = rf"\g<1>{sep}\g<2>" + strings = [_re.sub(pat, rep, s) for s in strings] - # Insert 'sep' after acronyms. 
Assumes Yahoo returns acronym strings upper-case - if not isinstance(acronyms, (set,list)): - raise Exception("camel2title() 'acronyms' argument should be an iterable of acronym strings") + # Insert 'sep' after acronyms for a in acronyms: - pat = "("+a+")" + "([A-Z][a-z])" - rep = r"\g<1>{}\g<2>".format(sep) - o = [_re.sub(pat, rep, i) for i in o] + pat = f"({a})([A-Z][a-z])" + rep = rf"\g<1>{sep}\g<2>" + strings = [_re.sub(pat, rep, s) for s in strings] # Apply str.title() to non-acronym words - o = [i.split(sep) for i in o] - o = [ [j.title() if not j in acronyms else j for j in i] for i in o] - o = [sep.join(i) for i in o] + strings = [s.split(sep) for s in strings] + strings = [ [j.title() if not j in acronyms else j for j in s] for s in strings] + strings = [sep.join(s) for s in strings] - return o + return strings def _parse_user_dt(dt, exchange_tz): From d7baa0713eefd99942817ad9fe6e22c201c0d4fb Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Fri, 25 Nov 2022 22:18:09 +0000 Subject: [PATCH 24/41] Get quote type from metadata instead info[] -> faster --- yfinance/base.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 85113631b..87fca757a 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -177,11 +177,7 @@ def history(self, period="1mo", interval="1d", proxy = {"https": proxy} #if the ticker is MUTUALFUND or ETF, then get capitalGains events - data = self.get_info(proxy) - if not data is None and 'quoteType' in data and data['quoteType'] in ('MUTUALFUND', 'ETF'): - params["events"] = "div,splits,capitalGains" - else: - params["events"] = "div,splits" + params["events"] = "div,splits,capitalGains" # Getting data from json url = "{}/v8/finance/chart/{}".format(self._base_url, self.ticker) @@ -269,6 +265,9 @@ def history(self, period="1mo", interval="1d", except Exception: pass + # Select useful info from metadata + quote_type = 
data["chart"]["result"][0]["meta"]["instrumentType"] + expect_capital_gains = quote_type in ('MUTUALFUND', 'ETF') tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"] # Note: ordering is important. If you change order, run the tests! @@ -306,13 +305,16 @@ def history(self, period="1mo", interval="1d", # actions dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0]) + if not expect_capital_gains: + capital_gains = None + if start is not None: # Note: use pandas Timestamp as datetime.utcfromtimestamp has bugs on windows # https://github.com/python/cpython/issues/81708 startDt = _pd.Timestamp(start, unit='s') if dividends is not None: dividends = dividends[dividends.index>=startDt] - if "capitalGains" in params["events"] and capital_gains is not None: + if capital_gains is not None: capital_gains = capital_gains[capital_gains.index>=startDt] if splits is not None: splits = splits[splits.index >= startDt] @@ -320,7 +322,7 @@ def history(self, period="1mo", interval="1d", endDt = _pd.Timestamp(end, unit='s') if dividends is not None: dividends = dividends[dividends.index 0: df = utils.safe_merge_dfs(df, capital_gains, interval) if "Capital Gains" in df.columns: From e234b8c5ab687e132ba71d0884286434984b3019 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Sun, 27 Nov 2022 19:00:45 +0100 Subject: [PATCH 25/41] #1207 Fixed regression issue with Python < 3.9 --- yfinance/utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/yfinance/utils.py b/yfinance/utils.py index 449017c3a..95c468c92 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -22,7 +22,7 @@ from __future__ import print_function import datetime as _datetime -from typing import Dict, Union +from typing import Dict, Union, List, Optional import pytz as _tz import requests as _requests @@ -216,7 +216,7 @@ def format_annual_financial_statement(level_detail, annual_dicts, annual_order, else: _statement = Annual 
- _statement.index = camel2title(_statement.T) + _statement.index = camel2title(_statement.T.index) _statement['level_detail'] = level_detail _statement = _statement.set_index([_statement.index, 'level_detail']) _statement = _statement[sorted(_statement.columns, reverse=True)] @@ -241,16 +241,16 @@ def format_quarterly_financial_statement(_statement, level_detail, order): return _statement -def camel2title(strings: list[str], sep: str=' ', acronyms: list[str] | None =None) -> list[str]: +def camel2title(strings: List[str], sep: str = ' ', acronyms: Optional[List[str]] = None) -> List[str]: if isinstance(strings, str) or not hasattr(strings, '__iter__') or not isinstance(strings[0], str): - raise Exception("camel2title() 'strings' argument must be iterable of strings") + raise TypeError("camel2title() 'strings' argument must be iterable of strings") if not isinstance(sep, str) or len(sep) != 1: - raise Exception(f"camel2title() 'sep' argument = '{sep}' must be single character") + raise ValueError(f"camel2title() 'sep' argument = '{sep}' must be single character") if _re.match("[a-zA-Z0-9]", sep): - raise Exception(f"camel2title() 'sep' argument = '{sep}' cannot be alpha-numeric") + raise ValueError(f"camel2title() 'sep' argument = '{sep}' cannot be alpha-numeric") if _re.escape(sep) != sep and sep not in {' ', '-'}: # Permit some exceptions, I don't understand why they get escaped - raise Exception(f"camel2title() 'sep' argument = '{sep}' cannot be special character") + raise ValueError(f"camel2title() 'sep' argument = '{sep}' cannot be special character") if acronyms is None: pat = "([a-z])([A-Z])" @@ -259,10 +259,10 @@ def camel2title(strings: list[str], sep: str=' ', acronyms: list[str] | None =No # Handling acronyms requires more care. 
Assumes Yahoo returns acronym strings upper-case if isinstance(acronyms, str) or not hasattr(acronyms, '__iter__') or not isinstance(acronyms[0], str): - raise Exception("camel2title() 'acronyms' argument must be iterable of strings") + raise TypeError("camel2title() 'acronyms' argument must be iterable of strings") for a in acronyms: if not _re.match("^[A-Z]+$", a): - raise Exception(f"camel2title() 'acronyms' argument must only contain upper-case, but '{a}' detected") + raise ValueError(f"camel2title() 'acronyms' argument must only contain upper-case, but '{a}' detected") # Insert 'sep' between lower-then-upper-case pat = "([a-z])([A-Z])" @@ -277,7 +277,7 @@ def camel2title(strings: list[str], sep: str=' ', acronyms: list[str] | None =No # Apply str.title() to non-acronym words strings = [s.split(sep) for s in strings] - strings = [ [j.title() if not j in acronyms else j for j in s] for s in strings] + strings = [[j.title() if not j in acronyms else j for j in s] for s in strings] strings = [sep.join(s) for s in strings] return strings From 37ac9bd1d5c4bd1a79d885d492a7812001f61471 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Sun, 27 Nov 2022 19:25:08 +0100 Subject: [PATCH 26/41] #1209 Fixed pretty format alters cached dataframe --- yfinance/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yfinance/base.py b/yfinance/base.py index 0e3ec108f..bfafe1b16 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -761,6 +761,7 @@ def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly" self._fundamentals.proxy = proxy data = self._fundamentals.financials.get_income(freq=freq, proxy=proxy) if pretty: + data = data.copy() data.index = utils.camel2title(data.index, sep=' ', acronyms=["EBIT", "EBITDA", "EPS", "NI"]) if as_dict: return data.to_dict() @@ -770,6 +771,7 @@ def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearl self._fundamentals.proxy = proxy data = 
self._fundamentals.financials.get_balance_sheet(freq=freq, proxy=proxy) if pretty: + data = data.copy() data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() @@ -779,6 +781,7 @@ def get_cashflow(self, proxy=None, as_dict=False, pretty=False, freq="yearly"): self._fundamentals.proxy = proxy data = self._fundamentals.financials.get_cash_flow(freq=freq, proxy=proxy) if pretty: + data = data.copy() data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() From 4064ec53c3f5747087d91ae4d821d712e3bd7df6 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sun, 27 Nov 2022 19:15:35 +0000 Subject: [PATCH 27/41] Move financials fallback logic into Ticker --- yfinance/base.py | 27 ++++-- yfinance/scrapers/fundamentals.py | 137 +++++++++++++++++++----------- 2 files changed, 110 insertions(+), 54 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 0c2a38c82..8863d5492 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -757,23 +757,38 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): return dict_data return data - def get_income_stmt(self, proxy=None, as_dict=False, freq="yearly"): + def get_income_stmt(self, proxy=None, as_dict=False, freq="yearly", fallback=True): self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_income(freq=freq, proxy=proxy) + data = self._fundamentals.financials.get_income_time_series(freq=freq, proxy=proxy) + + if (data is None or data.empty) and fallback: + print(f"{self.ticker}: Yahoo not displaying {freq}-income so falling back to old table format") + data = self._fundamentals.financials.get_income_scrape(freq=freq, proxy=proxy) + if as_dict: return data.to_dict() return data - def get_balance_sheet(self, proxy=None, as_dict=False, freq="yearly"): + def get_balance_sheet(self, proxy=None, as_dict=False, freq="yearly", fallback=True): self._fundamentals.proxy = proxy - data = 
self._fundamentals.financials.get_balance_sheet(freq=freq, proxy=proxy) + data = self._fundamentals.financials.get_balance_sheet_time_series(freq=freq, proxy=proxy) + + if (data is None or data.empty) and fallback: + print(f"{self.ticker}: Yahoo not displaying {freq}-balance-sheet so falling back to old table format") + data = self._fundamentals.financials.get_balance_sheet_scrape(freq=freq, proxy=proxy) + if as_dict: return data.to_dict() return data - def get_cashflow(self, proxy=None, as_dict=False, freq="yearly"): + def get_cashflow(self, proxy=None, as_dict=False, freq="yearly", fallback=True): self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_cash_flow(freq=freq, proxy=proxy) + data = self._fundamentals.financials.get_cash_flow_time_series(freq=freq, proxy=proxy) + + if (data is None or data.empty) and fallback: + print(f"{self.ticker}: Yahoo not displaying {freq}-cashflow so falling back to old table format") + data = self._fundamentals.financials.get_cash_flow_scrape(freq=freq, proxy=proxy) + if as_dict: return data.to_dict() return data diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index f89ca7803..8b21da76c 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -100,29 +100,36 @@ def _scrape_shares(self, proxy): class Fiancials: def __init__(self, data: TickerData): self._data = data - self._income = {} - self._balance_sheet = {} - self._cash_flow = {} - - def get_income(self, freq="yearly", proxy=None) -> pd.DataFrame: - res = self._income + self._income_time_series = {} + self._balance_sheet_time_series = {} + self._cash_flow_time_series = {} + self._income_scraped = {} + self._balance_sheet_scraped = {} + self._cash_flow_scraped = {} + + def get_income_time_series(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._income_time_series if freq not in res: - res[freq] = self._scrape("income", freq, proxy=None) + res[freq] = 
self._fetch_time_series("income", freq, proxy=None) return res[freq] - def get_balance_sheet(self, freq="yearly", proxy=None) -> pd.DataFrame: - res = self._balance_sheet + def get_balance_sheet_time_series(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._balance_sheet_time_series if freq not in res: - res[freq] = self._scrape("balance-sheet", freq, proxy=None) + res[freq] = self._fetch_time_series("balance-sheet", freq, proxy=None) return res[freq] - def get_cash_flow(self, freq="yearly", proxy=None) -> pd.DataFrame: - res = self._cash_flow + def get_cash_flow_time_series(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._cash_flow_time_series if freq not in res: - res[freq] = self._scrape("cash-flow", freq, proxy=None) + res[freq] = self._fetch_time_series("cash-flow", freq, proxy=None) return res[freq] - def _scrape(self, name, timescale, proxy=None): + def _fetch_time_series(self, name, timescale, proxy=None): + # Fetching time series preferred over scraping 'QuoteSummaryStore', + # because it matches what Yahoo shows. But for some tickers returns nothing, + # despite 'QuoteSummaryStore' containing valid data. + allowed_names = ["income", "balance-sheet", "cash-flow"] allowed_timescales = ["yearly", "quarterly"] @@ -134,12 +141,6 @@ def _scrape(self, name, timescale, proxy=None): try: statement = self._create_financials_table(name, timescale, proxy) - if statement.shape[0] == 0: - # Normally table only empty when nothing on Yahoo. So good? - # Except 'QuoteSummaryStore' still contains the old financial data, - # is it useful to return? 
- statement = self._create_financials_table_old(name, timescale, proxy) - if statement is not None: return statement except YFianceException as e: @@ -163,34 +164,6 @@ def _create_financials_table(self, name, timescale, proxy): except Exception as e: pass - def _create_financials_table_old(self, name, timescale, proxy): - data_stores = self._data.get_json_data_stores(name, proxy) - - # Fetch raw data - data = data_stores["QuoteSummaryStore"] - key1 = name.replace('-','') + "StatementHistory" - if timescale == "quarterly": - key1 += "Quarterly" - key2 = name.replace('-','') + "Statements" - data = data.get(key1)[key2] - - # Tabulate - df = pd.DataFrame(data).drop(columns=['maxAge']) - for col in df.columns: - df[col] = df[col].replace('-', np.nan) - df.set_index('endDate', inplace=True) - try: - df.index = pd.to_datetime(df.index, unit='s') - except ValueError: - df.index = pd.to_datetime(df.index) - df = df.T - df.columns.name = '' - df.index.name = 'Breakdown' - # rename incorrect yahoo key - df.rename(index={'treasuryStock': 'Gains Losses Not Affecting Retained Earnings'}, inplace=True) - df.index = utils.camel2title(df.index) - return df - def _get_datastore_keys(self, sub_page, proxy) -> list: data_stores = self._data.get_json_data_stores(sub_page, proxy) @@ -264,3 +237,71 @@ def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.Da df = df[sorted(df.columns, reverse=True)] return df + + def get_income_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._income_scraped + if freq not in res: + res[freq] = self._scrape("income", freq, proxy=None) + return res[freq] + + def get_balance_sheet_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._balance_sheet_scraped + if freq not in res: + res[freq] = self._scrape("income", freq, proxy=None) + return res[freq] + + def get_cash_flow_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame: + res = self._cash_flow_scraped + if freq not in res: + res[freq] = 
self._scrape("income", freq, proxy=None) + return res[freq] + + def _scrape(self, name, timescale, proxy=None): + # Backup in case _fetch_time_series() fails to return data + + allowed_names = ["income", "balance-sheet", "cash-flow"] + allowed_timescales = ["yearly", "quarterly"] + + if name not in allowed_names: + raise ValueError("Illegal argument: name must be one of: {}".format(allowed_names)) + if timescale not in allowed_timescales: + raise ValueError("Illegal argument: timescale must be one of: {}".format(allowed_names)) + + try: + # Normally table only empty when nothing on Yahoo. So good? + # Except 'QuoteSummaryStore' still contains the old financial data, + # is it useful to return? + statement = self._create_financials_table_old(name, timescale, proxy) + + if statement is not None: + return statement + except YFianceException as e: + print("Failed to create financials table for {} reason: {}".format(name, repr(e))) + return pd.DataFrame() + + def _create_financials_table_old(self, name, timescale, proxy): + data_stores = self._data.get_json_data_stores("financials", proxy) + + # Fetch raw data + data = data_stores["QuoteSummaryStore"] + key2 = name.replace('-','') + "StatementHistory" + if timescale == "quarterly": + key1 = key2 + "Quarterly" + data = data.get(key1)[key2] + + # Tabulate + df = pd.DataFrame(data).drop(columns=['maxAge']) + for col in df.columns: + df[col] = df[col].replace('-', np.nan) + df.set_index('endDate', inplace=True) + try: + df.index = pd.to_datetime(df.index, unit='s') + except ValueError: + df.index = pd.to_datetime(df.index) + df = df.T + df.columns.name = '' + df.index.name = 'Breakdown' + # rename incorrect yahoo key + df.rename(index={'treasuryStock': 'Gains Losses Not Affecting Retained Earnings'}, inplace=True) + df.index = utils.camel2title(df.index) + return df From 5e333f53eee36127baa15fecea25c37ddc783843 Mon Sep 17 00:00:00 2001 From: Fredrik Corneliusson Date: Tue, 29 Nov 2022 01:18:59 +0100 Subject: [PATCH 28/41] 
#1213 Added test asserting no harmful requests are added to history call. --- tests/ticker.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/ticker.py b/tests/ticker.py index 037a0860c..b1c2eaae8 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -135,6 +135,26 @@ def setUp(self): def tearDown(self): self.ticker = None + def test_history(self): + data = self.ticker.history("1y") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + + def test_no_expensive_calls_introduced(self): + """ + Make sure calling history to get price data has not introduced more calls to yahoo than absolutely necessary. + As doing other type of scraping calls than "query2.finance.yahoo.com/v8/finance/chart" to yahoo website + will quickly trigger spam-block when doing bulk download of history data. + """ + session = requests_cache.CachedSession(backend='memory') + ticker = yf.Ticker("GOOGL", session=session) + ticker.history("1y") + actual_urls_called = tuple(session.cache.urls) + expected_urls = ( + 'https://query2.finance.yahoo.com/v8/finance/chart/GOOGL?range=1y&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains', + ) + self.assertEqual(expected_urls, actual_urls_called, "Different from expected url used to fetch history.") + def test_dividends(self): data = self.ticker.dividends self.assertIsInstance(data, pd.Series, "data has wrong type") From 2eae33bd330881a4eb1555266fdb0673a22a36ea Mon Sep 17 00:00:00 2001 From: ymyke Date: Tue, 29 Nov 2022 23:28:16 +0100 Subject: [PATCH 29/41] Fix a couple of minor issues in README - Typos in variable name - `Ticker` doesn't support several tickers - `Tickers` doesn't return named tuple - "1m" in `download` would produce an error for longer timeframes, so changing the example to "5d" --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4b7af1128..b5746c3b7 100644 ---
a/README.md +++ b/README.md @@ -108,9 +108,9 @@ msft.recommendations msft.recommendations_summary # show analysts other work msft.analyst_price_target -mfst.revenue_forecasts -mfst.earnings_forecasts -mfst.earnings_trend +msft.revenue_forecasts +msft.earnings_forecasts +msft.earnings_trend # show next event (earnings, etc) msft.calendar @@ -160,7 +160,7 @@ the Ticker constructor. import requests_cache session = requests_cache.CachedSession('yfinance.cache') session.headers['User-agent'] = 'my-program/1.0' -ticker = yf.Ticker('msft aapl goog', session=session) +ticker = yf.Ticker('msft', session=session) # The scraped response will be stored in the cache ticker.actions ``` @@ -171,7 +171,6 @@ To initialize multiple `Ticker` objects, use import yfinance as yf tickers = yf.Tickers('msft aapl goog') -# ^ returns a named tuple of Ticker objects # access each ticker using (example) tickers.tickers['MSFT'].info @@ -201,7 +200,7 @@ data = yf.download( # or pdr.get_data_yahoo(... # fetch data by interval (including intraday if period < 60 days) # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo # (optional, default is '1d') - interval = "1m", + interval = "5d", # Whether to ignore timezone when aligning ticker data from # different timezones. Default is True. 
False may be useful for From f93c3d76ce0d72a3beef4de91cd8dc1f73c38912 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Wed, 30 Nov 2022 16:58:03 +0000 Subject: [PATCH 30/41] Extend Yahoo duplication fix to intra-day --- tests/prices.py | 20 ++++++++++++++++++++ yfinance/utils.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/prices.py b/tests/prices.py index 788a8a224..0b43248dc 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -36,6 +36,26 @@ def test_daily_index(self): f = df.index.time == _dt.time(0) self.assertTrue(f.all()) + def test_duplicatingHourly(self): + tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"] + for tkr in tkrs: + dat = yf.Ticker(tkr, session=self.session) + tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None) + + dt_utc = _tz.timezone("UTC").localize(_dt.datetime.utcnow()) + dt = dt_utc.astimezone(_tz.timezone(tz)) + + df = dat.history(start=dt.date() - _dt.timedelta(days=1), interval="1h") + + dt0 = df.index[-2] + dt1 = df.index[-1] + try: + self.assertNotEqual(dt0.hour, dt1.hour) + except: + print("Ticker = ", tkr) + raise + + def test_duplicatingDaily(self): tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"] test_run = False diff --git a/yfinance/utils.py b/yfinance/utils.py index 95c468c92..37c5a7848 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -440,7 +440,7 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange): elif interval == "3mo": last_rows_same_interval = dt1.year == dt2.year and dt1.quarter == dt2.quarter else: - last_rows_same_interval = False + last_rows_same_interval = (dt1-dt2) < _pd.Timedelta(interval) if last_rows_same_interval: # Last two rows are within same interval From d963e3fe1ce849ad8d5cf65a2ba5a125406aea41 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Thu, 1 Dec 2022 15:47:37 +0000 Subject: [PATCH 31/41] Fix dev merge ; Fix financials fallback fetch --- tests/ticker.py | 47 ++++++++++++++++++++++++++++++- yfinance/base.py | 6 ++-- 
yfinance/scrapers/fundamentals.py | 36 ++++++++++++++++------- 3 files changed, 75 insertions(+), 14 deletions(-) diff --git a/tests/ticker.py b/tests/ticker.py index 037a0860c..ee169b1ac 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -246,9 +246,24 @@ def test_mutualfund_holders(self): class TestTickerMiscFinancials(unittest.TestCase): + session = None + + @classmethod + def setUpClass(cls): + cls.session = requests_cache.CachedSession(backend='memory') + + @classmethod + def tearDownClass(cls): + if cls.session is not None: + cls.session.close() def setUp(self): - self.ticker = yf.Ticker("GOOGL") + self.ticker = yf.Ticker("GOOGL", session=self.session) + + # For ticker 'BSE.AX' (and others), Yahoo not returning + # full quarterly financials (usually cash-flow) with all entries, + # instead returns a smaller version in different data store. + self.ticker_old_fmt = yf.Ticker("BSE.AX", session=self.session) def tearDown(self): self.ticker = None @@ -281,6 +296,16 @@ def test_income_statement_formatting(self): for k in expected_keys: self.assertIn(k, data.index, "Did not find expected row in index") + def test_quarterly_income_statement_old_fmt(self): + expected_row = "TotalRevenue" + data = self.ticker_old_fmt.quarterly_income_stmt + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertIn(expected_row, data.index, "Did not find expected row in index") + + data_cached = self.ticker_old_fmt.quarterly_income_stmt + self.assertIs(data, data_cached, "data not cached") + def test_balance_sheet(self): expected_row = "TotalAssets" data = self.ticker.balance_sheet @@ -301,6 +326,16 @@ def test_quarterly_balance_sheet(self): data_cached = self.ticker.quarterly_balance_sheet self.assertIs(data, data_cached, "data not cached") + def test_quarterly_balance_sheet_old_fmt(self): + expected_row = "TotalAssets" + data = self.ticker_old_fmt.quarterly_balance_sheet + self.assertIsInstance(data, 
pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertIn(expected_row, data.index, "Did not find expected row in index") + + data_cached = self.ticker_old_fmt.quarterly_balance_sheet + self.assertIs(data, data_cached, "data not cached") + def test_balance_sheet_formatting(self): expected_keys = ["Total Assets", "Net PPE"] data = self.ticker.get_balance_sheet(pretty=True) @@ -329,6 +364,16 @@ def test_quarterly_cashflow(self): data_cached = self.ticker.quarterly_cashflow self.assertIs(data, data_cached, "data not cached") + def test_quarterly_cashflow_old_fmt(self): + expected_row = "NetIncome" + data = self.ticker_old_fmt.quarterly_cashflow + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + self.assertIn(expected_row, data.index, "Did not find expected row in index") + + data_cached = self.ticker_old_fmt.quarterly_cashflow + self.assertIs(data, data_cached, "data not cached") + def test_cashflow_formatting(self): expected_keys = ["Operating Cash Flow", "Net PPE Purchase And Sale"] data = self.ticker.get_cashflow(pretty=True) diff --git a/yfinance/base.py b/yfinance/base.py index e38bc63a9..30513924c 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -759,7 +759,7 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly", fallback=True): self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_income(freq=freq, proxy=proxy) + data = self._fundamentals.financials.get_income_time_series(freq=freq, proxy=proxy) if (data is None or data.empty) and fallback: print(f"{self.ticker}: Yahoo not displaying {freq}-income so falling back to old table format") @@ -773,7 +773,7 @@ def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly" def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearly", fallback=True): 
self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_balance_sheet(freq=freq, proxy=proxy) + data = self._fundamentals.financials.get_balance_sheet_time_series(freq=freq, proxy=proxy) if (data is None or data.empty) and fallback: print(f"{self.ticker}: Yahoo not displaying {freq}-balance-sheet so falling back to old table format") @@ -787,7 +787,7 @@ def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearl def get_cashflow(self, proxy=None, as_dict=False, pretty=False, freq="yearly", fallback=True): self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_cash_flow(freq=freq, proxy=proxy) + data = self._fundamentals.financials.get_cash_flow_time_series(freq=freq, proxy=proxy) if (data is None or data.empty) and fallback: print(f"{self.ticker}: Yahoo not displaying {freq}-cashflow so falling back to old table format") diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index 8b21da76c..159a5fd31 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -247,13 +247,13 @@ def get_income_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame: def get_balance_sheet_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame: res = self._balance_sheet_scraped if freq not in res: - res[freq] = self._scrape("income", freq, proxy=None) + res[freq] = self._scrape("balance-sheet", freq, proxy=None) return res[freq] def get_cash_flow_scrape(self, freq="yearly", proxy=None) -> pd.DataFrame: res = self._cash_flow_scraped if freq not in res: - res[freq] = self._scrape("income", freq, proxy=None) + res[freq] = self._scrape("cash-flow", freq, proxy=None) return res[freq] def _scrape(self, name, timescale, proxy=None): @@ -268,9 +268,6 @@ def _scrape(self, name, timescale, proxy=None): raise ValueError("Illegal argument: timescale must be one of: {}".format(allowed_names)) try: - # Normally table only empty when nothing on Yahoo. So good? 
- # Except 'QuoteSummaryStore' still contains the old financial data, - # is it useful to return? statement = self._create_financials_table_old(name, timescale, proxy) if statement is not None: @@ -283,14 +280,29 @@ def _create_financials_table_old(self, name, timescale, proxy): data_stores = self._data.get_json_data_stores("financials", proxy) # Fetch raw data + if not "QuoteSummaryStore" in data_stores: + return pd.DataFrame() data = data_stores["QuoteSummaryStore"] - key2 = name.replace('-','') + "StatementHistory" + + if name == "cash-flow": + key1 = "cashflowStatement" + key2 = "cashflowStatements" + elif name == "balance-sheet": + key1 = "balanceSheet" + key2 = "balanceSheetStatements" + else: + key1 = "incomeStatement" + key2 = "incomeStatementHistory" + key1 += "History" if timescale == "quarterly": - key1 = key2 + "Quarterly" + key1 += "Quarterly" data = data.get(key1)[key2] # Tabulate - df = pd.DataFrame(data).drop(columns=['maxAge']) + df = pd.DataFrame(data) + if len(df) == 0: + return pd.DataFrame() + df = df.drop(columns=['maxAge']) for col in df.columns: df[col] = df[col].replace('-', np.nan) df.set_index('endDate', inplace=True) @@ -302,6 +314,10 @@ def _create_financials_table_old(self, name, timescale, proxy): df.columns.name = '' df.index.name = 'Breakdown' # rename incorrect yahoo key - df.rename(index={'treasuryStock': 'Gains Losses Not Affecting Retained Earnings'}, inplace=True) - df.index = utils.camel2title(df.index) + df.rename(index={'treasuryStock': 'gainsLossesNotAffectingRetainedEarnings'}, inplace=True) + + # Upper-case first letter, leave rest unchanged: + s0 = df.index[0] + df.index = [s[0].upper()+s[1:] for s in df.index] + return df From e26a4c5a1c62c2fdb2a62e896b5650538db82732 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Thu, 1 Dec 2022 16:29:13 +0000 Subject: [PATCH 32/41] Improve handling dividends without matching price interval Tolerate merging daily dividend event without matching prices interval (just append). 
Move price-repair to after merge, to fix these missing prices intervals. Improve bad-price detection & repair. --- yfinance/base.py | 147 ++++++++++++++++++++++++---------------------- yfinance/utils.py | 13 ++-- 2 files changed, 84 insertions(+), 76 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index abbd5340e..53c1adff2 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -274,34 +274,6 @@ def history(self, period="1mo", interval="1d", quotes = utils.set_df_tz(quotes, params["interval"], tz_exchange) quotes = utils.fix_Yahoo_dst_issue(quotes, params["interval"]) quotes = utils.fix_Yahoo_returning_live_separate(quotes, params["interval"], tz_exchange) - if repair: - # Do this before auto/back adjust - quotes = self._fix_zero_prices(quotes, interval, tz_exchange) - quotes = self._fix_unit_mixups(quotes, interval, tz_exchange) - - # Auto/back adjust - try: - if auto_adjust: - quotes = utils.auto_adjust(quotes) - elif back_adjust: - quotes = utils.back_adjust(quotes) - except Exception as e: - if auto_adjust: - err_msg = "auto_adjust failed with %s" % e - else: - err_msg = "back_adjust failed with %s" % e - shared._DFS[self.ticker] = utils.empty_df() - shared._ERRORS[self.ticker] = err_msg - if debug: - if raise_errors: - raise Exception('%s: %s' % (self.ticker, err_msg)) - else: - print('%s: %s' % (self.ticker, err_msg)) - - if rounding: - quotes = _np.round(quotes, data[ - "chart"]["result"][0]["meta"]["priceHint"]) - quotes['Volume'] = quotes['Volume'].fillna(0).astype(_np.int64) # actions dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0]) @@ -366,6 +338,35 @@ def history(self, period="1mo", interval="1d", else: df["Capital Gains"] = 0.0 + if repair: + # Do this before auto/back adjust + df = self._fix_zero_prices(df, interval, tz_exchange) + df = self._fix_unit_mixups(df, interval, tz_exchange) + + # Auto/back adjust + try: + if auto_adjust: + df = utils.auto_adjust(df) + elif back_adjust: + df = 
utils.back_adjust(df) + except Exception as e: + if auto_adjust: + err_msg = "auto_adjust failed with %s" % e + else: + err_msg = "back_adjust failed with %s" % e + shared._DFS[self.ticker] = utils.empty_df() + shared._ERRORS[self.ticker] = err_msg + if debug: + if raise_errors: + raise Exception('%s: %s' % (self.ticker, err_msg)) + else: + print('%s: %s' % (self.ticker, err_msg)) + + if rounding: + df = _np.round(df, data[ + "chart"]["result"][0]["meta"]["priceHint"]) + df['Volume'] = df['Volume'].fillna(0).astype(_np.int64) + if intraday: df.index.name = "Datetime" else: @@ -418,48 +419,46 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): new_vals = {} if sub_interval == "1h": - df_fine = self.history(start=start, end=start + td_range, interval=sub_interval, auto_adjust=False) + df_fine = self.history(start=start, end=start + td_range, interval=sub_interval, auto_adjust=False, prepost=True) else: df_fine = self.history(start=start - td_range, end=start + td_range, interval=sub_interval, auto_adjust=False) # First, check whether df_fine has different split-adjustment than df_row. - # If it is different, then adjust df_fine to match df_row + # If different, then adjust df_fine to match df_row good_fields = list(set(data_cols) - set(bad_fields) - set("Adj Close")) - if len(good_fields) == 0: - raise Exception( - "No good fields, so cannot determine whether different split-adjustment. 
Contact developers") - # median = df_row.loc[good_fields].median() - # median_fine = _np.median(df_fine[good_fields].values) - # ratio = median/median_fine - # Better method to calculate split-adjustment: - df_fine_from_idx = df_fine[df_fine.index >= idx] - ratios = [] - for f in good_fields: - if f == "Low": - ratios.append(df_row[f] / df_fine_from_idx[f].min()) - elif f == "High": - ratios.append(df_row[f] / df_fine_from_idx[f].max()) - elif f == "Open": - ratios.append(df_row[f] / df_fine_from_idx[f].iloc[0]) - elif f == "Close": - ratios.append(df_row[f] / df_fine_from_idx[f].iloc[-1]) - ratio = _np.mean(ratios) - # - ratio_rcp = round(1.0 / ratio, 1) - ratio = round(ratio, 1) - if ratio == 1 and ratio_rcp == 1: - # Good! - pass - else: - if ratio > 1: - # data has different split-adjustment than fine-grained data - # Adjust fine-grained to match - df_fine[data_cols] *= ratio - elif ratio_rcp > 1: - # data has different split-adjustment than fine-grained data - # Adjust fine-grained to match - df_fine[data_cols] *= 1.0 / ratio_rcp + if len(good_fields) > 0: + # median = df_row.loc[good_fields].median() + # median_fine = _np.median(df_fine[good_fields].values) + # ratio = median/median_fine + # Better method to calculate split-adjustment: + df_fine_from_idx = df_fine[df_fine.index >= idx] + ratios = [] + for f in good_fields: + if f == "Low": + ratios.append(df_row[f] / df_fine_from_idx[f].min()) + elif f == "High": + ratios.append(df_row[f] / df_fine_from_idx[f].max()) + elif f == "Open": + ratios.append(df_row[f] / df_fine_from_idx[f].iloc[0]) + elif f == "Close": + ratios.append(df_row[f] / df_fine_from_idx[f].iloc[-1]) + ratio = _np.mean(ratios) + # + ratio_rcp = round(1.0 / ratio, 1) + ratio = round(ratio, 1) + if ratio == 1 and ratio_rcp == 1: + # Good! 
+ pass + else: + if ratio > 1: + # data has different split-adjustment than fine-grained data + # Adjust fine-grained to match + df_fine[data_cols] *= ratio + elif ratio_rcp > 1: + # data has different split-adjustment than fine-grained data + # Adjust fine-grained to match + df_fine[data_cols] *= 1.0 / ratio_rcp if sub_interval != "1h": df_last_week = df_fine[df_fine.index < idx] @@ -483,6 +482,8 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): new_vals["Close"] = df_fine["Close"].iloc[-1] # Assume 'Adj Close' also corrupted, easier than detecting whether true new_vals["Adj Close"] = df_fine["Adj Close"].iloc[-1] + if "Volume" in bad_fields: + new_vals["Volume"] = df_fine["Volume"].sum() return new_vals @@ -567,7 +568,9 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): return df2 def _fix_zero_prices(self, df, interval, tz_exchange): - # Sometimes Yahoo returns prices=0 when obviously wrong e.g. Volume>0 and Close>0. + # Sometimes Yahoo returns prices=0 or NaN, but obviously wrong because e.g.: + # - Volume > 0 and Close > 0 + # - Dividends or Stock Splits > 0 # Easy to detect and fix if df.shape[0] == 0: @@ -583,15 +586,17 @@ def _fix_zero_prices(self, df, interval, tz_exchange): else: df2.index = df2.index.tz_convert(tz_exchange) - data_cols = ["Open", "High", "Low", "Close"] - data_cols = [c for c in data_cols if c in df2.columns] - f_zeroes = (df2[data_cols] == 0.0).values.any(axis=1) + data_cols = [c for c in ["Open", "High", "Low", "Close"] if c in df2.columns] + f_zero_or_nan = (df2[data_cols] == 0.0).values.any(axis=1) | df2[data_cols].isna().values.any(axis=1) + f_fixable = (df2[[c for c in ["Close","Volume","Dividends","Stock Splits"] if c in df2.columns]]>0).any(axis=1) + f_repair = f_zero_or_nan & f_fixable n_fixed = 0 - for i in _np.where(f_zeroes)[0]: + data_cols += ["Adj Close", "Volume"] + for i in _np.where(f_repair)[0]: idx = df2.index[i] - df_row = df2.loc[idx] - bad_fields = df2.columns[df_row.values == 0.0].values + 
df_row = df2.loc[idx, data_cols] + bad_fields = df_row.index[(df_row.values==0.0)|df_row.isna().values].values new_values = self._reconstruct_interval(df2.loc[idx], interval, bad_fields) if not new_values is None: for k in new_values: diff --git a/yfinance/utils.py b/yfinance/utils.py index 95c468c92..24bc1d3af 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -301,6 +301,7 @@ def _parse_user_dt(dt, exchange_tz): def auto_adjust(data): + col_order = data.columns df = data.copy() ratio = df["Close"] / df["Adj Close"] df["Adj Open"] = df["Open"] / ratio @@ -316,13 +317,13 @@ def auto_adjust(data): "Adj Low": "Low", "Adj Close": "Close" }, inplace=True) - df = df[["Open", "High", "Low", "Close", "Volume"]] - return df[["Open", "High", "Low", "Close", "Volume"]] + return df[[c for c in col_order if c in df.columns]] def back_adjust(data): """ back-adjusted data to mimic true historical prices """ + col_order = data.columns df = data.copy() ratio = df["Adj Close"] / df["Close"] df["Adj Open"] = df["Open"] * ratio @@ -338,7 +339,7 @@ def back_adjust(data): "Adj Low": "Low" }, inplace=True) - return df[["Open", "High", "Low", "Close", "Volume"]] + return df[[c for c in col_order if c in df.columns]] def parse_quotes(data): @@ -589,13 +590,15 @@ def _reindex_events(df, new_index, data_col_name): ## Not always possible to match events with trading, e.g. when released pre-market. ## So have to append to bottom with nan prices. ## But should only be impossible with intra-day price data. 
- if interval.endswith('m') or interval.endswith('h'): + if interval.endswith('m') or interval.endswith('h') or interval == "1d": + # Update: is possible with daily data when dividend very recent f_missing = ~df_sub.index.isin(df.index) df_sub_missing = df_sub[f_missing] keys = {"Adj Open", "Open", "Adj High", "High", "Adj Low", "Low", "Adj Close", "Close"}.intersection(df.columns) df_sub_missing[list(keys)] = _np.nan - df = _pd.concat([df, df_sub_missing], sort=True) + col_ordering = df.columns + df = _pd.concat([df, df_sub_missing], sort=True)[col_ordering] else: raise Exception("Lost data during merge despite all attempts to align data (see above)") From 762d446661d5f41c194689d010829742bf4f043b Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Thu, 1 Dec 2022 18:49:43 +0000 Subject: [PATCH 33/41] Default enable 'pretty' financials, explain in README --- README.md | 10 +-- tests/ticker.py | 171 ++++++++++++++++++++++++++++++++++----------- yfinance/ticker.py | 12 ++-- 3 files changed, 142 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index b5746c3b7..98beaf180 100644 --- a/README.md +++ b/README.md @@ -75,17 +75,17 @@ msft.capital_gains # show share count msft.shares -# show income statement +# show financials: +# - income statement msft.income_stmt msft.quarterly_income_stmt - -# show balance sheet +# - balance sheet msft.balance_sheet msft.quarterly_balance_sheet - -# show cash flow statement +# - cash flow statement msft.cashflow msft.quarterly_cashflow +# other presentations available, see `Ticker.get_income_stmt(as_dict, pretty)` # show major holders msft.major_holders diff --git a/tests/ticker.py b/tests/ticker.py index b1c2eaae8..4e9a0fe90 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -274,88 +274,179 @@ def tearDown(self): self.ticker = None def test_income_statement(self): - expected_row = "TotalRevenue" - data = self.ticker.income_stmt + expected_keys = ["Total Revenue", "Basic EPS"] + expected_periods_days = 365 + + # Test 
contents of table + data = self.ticker.get_income_stmt(pretty=True) self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning annual financials") - data_cached = self.ticker.income_stmt - self.assertIs(data, data_cached, "data not cached") + # Test property defaults + data2 = self.ticker.income_stmt + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") - def test_quarterly_income_statement(self): - expected_row = "TotalRevenue" - data = self.ticker.quarterly_income_stmt + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_income_stmt(pretty=False) self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") - data_cached = self.ticker.quarterly_income_stmt - self.assertIs(data, data_cached, "data not cached") + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") - def test_income_statement_formatting(self): + + def test_quarterly_income_statement(self): expected_keys = ["Total Revenue", "Basic EPS"] - data = self.ticker.get_income_stmt(pretty=True) + expected_periods_days = 365//4 + + # Test contents of table + data = self.ticker.get_income_stmt(pretty=True, freq="quarterly") self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") for k in expected_keys: self.assertIn(k, data.index, "Did 
not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning quarterly financials") + + # Test property defaults + data2 = self.ticker.quarterly_income_stmt + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_income_stmt(pretty=False, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") def test_balance_sheet(self): - expected_row = "TotalAssets" - data = self.ticker.balance_sheet + expected_keys = ["Total Assets", "Net PPE"] + expected_periods_days = 365 + + # Test contents of table + data = self.ticker.get_balance_sheet(pretty=True) self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning annual financials") - data_cached = self.ticker.balance_sheet - self.assertIs(data, data_cached, "data not cached") + # Test property defaults + data2 = self.ticker.balance_sheet + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") - def test_quarterly_balance_sheet(self): - expected_row = "TotalAssets" - data = self.ticker.quarterly_balance_sheet + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = 
self.ticker.get_balance_sheet(pretty=False) self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") - data_cached = self.ticker.quarterly_balance_sheet - self.assertIs(data, data_cached, "data not cached") + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") - def test_balance_sheet_formatting(self): + def test_quarterly_balance_sheet(self): expected_keys = ["Total Assets", "Net PPE"] - data = self.ticker.get_balance_sheet(pretty=True) + expected_periods_days = 365//4 + + # Test contents of table + data = self.ticker.get_balance_sheet(pretty=True, freq="quarterly") self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") for k in expected_keys: self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning quarterly financials") - def test_cashflow(self): - expected_row = "OperatingCashFlow" - data = self.ticker.cashflow + # Test property defaults + data2 = self.ticker.quarterly_balance_sheet + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_balance_sheet(pretty=False, freq="quarterly") self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") - data_cached = self.ticker.cashflow - self.assertIs(data, data_cached, "data 
not cached") + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + def test_cash_flow(self): + expected_keys = ["Operating Cash Flow", "Net PPE Purchase And Sale"] + expected_periods_days = 365 - def test_quarterly_cashflow(self): - expected_row = "OperatingCashFlow" - data = self.ticker.quarterly_cashflow + # Test contents of table + data = self.ticker.get_cashflow(pretty=True) self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") - self.assertIn(expected_row, data.index, "Did not find expected row in index") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning annual financials") - data_cached = self.ticker.quarterly_cashflow - self.assertIs(data, data_cached, "data not cached") + # Test property defaults + data2 = self.ticker.cashflow + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_cashflow(pretty=False) + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") - def test_cashflow_formatting(self): + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") + + def test_quarterly_cash_flow(self): expected_keys = ["Operating Cash Flow", "Net PPE Purchase And Sale"] - data = self.ticker.get_cashflow(pretty=True) + expected_periods_days = 365//4 + + # Test contents of table + data = self.ticker.get_cashflow(pretty=True, freq="quarterly") self.assertIsInstance(data, pd.DataFrame, "data has wrong type") 
self.assertFalse(data.empty, "data is empty") for k in expected_keys: self.assertIn(k, data.index, "Did not find expected row in index") + period = abs((data.columns[0]-data.columns[1]).days) + self.assertLess(abs(period-expected_periods_days), 20, "Not returning quarterly financials") + + # Test property defaults + data2 = self.ticker.quarterly_cashflow + self.assertTrue(data.equals(data2), "property not defaulting to 'pretty=True'") + + # Test pretty=False + expected_keys = [k.replace(' ', '') for k in expected_keys] + data = self.ticker.get_cashflow(pretty=False, freq="quarterly") + self.assertIsInstance(data, pd.DataFrame, "data has wrong type") + self.assertFalse(data.empty, "data is empty") + for k in expected_keys: + self.assertIn(k, data.index, "Did not find expected row in index") + + # Test to_dict + data = self.ticker.get_income_stmt(as_dict=True) + self.assertIsInstance(data, dict, "data has wrong type") def test_sustainability(self): data = self.ticker.sustainability diff --git a/yfinance/ticker.py b/yfinance/ticker.py index 3874cc630..0ee3c4577 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -155,19 +155,19 @@ def quarterly_earnings(self) -> _pd.DataFrame: @property def income_stmt(self) -> _pd.DataFrame: - return self.get_income_stmt() + return self.get_income_stmt(pretty=True) @property def quarterly_income_stmt(self) -> _pd.DataFrame: - return self.get_income_stmt(freq='quarterly') + return self.get_income_stmt(pretty=True, freq='quarterly') @property def balance_sheet(self) -> _pd.DataFrame: - return self.get_balance_sheet() + return self.get_balance_sheet(pretty=True) @property def quarterly_balance_sheet(self) -> _pd.DataFrame: - return self.get_balance_sheet(freq='quarterly') + return self.get_balance_sheet(pretty=True, freq='quarterly') @property def balancesheet(self) -> _pd.DataFrame: @@ -179,11 +179,11 @@ def quarterly_balancesheet(self) -> _pd.DataFrame: @property def cashflow(self) -> _pd.DataFrame: - return 
self.get_cashflow(freq="yearly") + return self.get_cashflow(pretty=True, freq="yearly") @property def quarterly_cashflow(self) -> _pd.DataFrame: - return self.get_cashflow(freq='quarterly') + return self.get_cashflow(pretty=True, freq='quarterly') @property def recommendations_summary(self): From b3dbbc46e28eaaf4b1cccb08047b2f3cfb02f5d0 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Tue, 6 Dec 2022 18:04:30 +0000 Subject: [PATCH 34/41] If fetching price history ending in future, don't use cache --- yfinance/base.py | 22 +++++++++++++++++----- yfinance/data.py | 7 +++++-- yfinance/scrapers/fundamentals.py | 2 +- yfinance/scrapers/holders.py | 2 +- yfinance/scrapers/quote.py | 2 +- 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 53c1adff2..0aaa48316 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -185,7 +185,18 @@ def history(self, period="1mo", interval="1d", data = None try: - data = self._data.get( + get_fn = self._data.get + if end is not None: + end_dt = _pd.Timestamp(end, unit='s').tz_localize("UTC") + dt_now = _datetime.datetime.utcnow().astimezone(end_dt.tzinfo) + print(dt_now) + dt_now = end_dt.tzinfo.localize(_datetime.datetime.utcnow()) + print(dt_now) + # if end_dt.date() <= dt_now.date(): + if end_dt <= dt_now: + # Date range in past so safe to fetch through cache: + get_fn = self._data.cache_get + data = get_fn( url=url, params=params, timeout=timeout @@ -197,6 +208,7 @@ def history(self, period="1mo", interval="1d", data = data.json() except Exception: + raise pass err_msg = "No data found for this date range, symbol may be delisted" @@ -639,7 +651,7 @@ def _fetch_ticker_tz(self, debug_mode, proxy, timeout): url = "{}/v8/finance/chart/{}".format(self._base_url, self.ticker) try: - data = self._data.get(url=url, params=params, proxy=proxy, timeout=timeout) + data = self._data.cache_get(url=url, params=params, proxy=proxy, timeout=timeout) data = data.json() except Exception as e: if 
debug_mode: @@ -859,7 +871,7 @@ def get_isin(self, proxy=None) -> Optional[str]: url = 'https://markets.businessinsider.com/ajax/' \ 'SearchController_Suggest?max_results=25&query=%s' \ % urlencode(q) - data = self._data.get(url=url, proxy=proxy).text + data = self._data.cache_get(url=url, proxy=proxy).text search_str = '"{}|'.format(ticker) if search_str not in data: @@ -881,7 +893,7 @@ def get_news(self, proxy=None): # Getting data from json url = "{}/v1/finance/search?q={}".format(self._base_url, self.ticker) - data = self._data.get(url=url, proxy=proxy) + data = self._data.cache_get(url=url, proxy=proxy) if "Will be right back" in data.text: raise RuntimeError("*** YAHOO! FINANCE IS CURRENTLY DOWN! ***\n" "Our engineers are working quickly to resolve " @@ -912,7 +924,7 @@ def get_earnings_dates(self, limit=12, proxy=None) -> Optional[pd.DataFrame]: url = "{}/calendar/earnings?symbol={}&offset={}&size={}".format( _ROOT_URL_, self.ticker, page_offset, page_size) - data = self._data.get(url=url, proxy=proxy).text + data = self._data.cache_get(url=url, proxy=proxy).text if "Will be right back" in data: raise RuntimeError("*** YAHOO! FINANCE IS CURRENTLY DOWN! 
***\n" diff --git a/yfinance/data.py b/yfinance/data.py index ef3f6cde1..fb507d50b 100644 --- a/yfinance/data.py +++ b/yfinance/data.py @@ -49,8 +49,6 @@ def __init__(self, ticker: str, session=None): self.ticker = ticker self._session = session or requests - @lru_cache_freezeargs - @lru_cache(maxsize=cache_maxsize) def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30): proxy = self._get_proxy(proxy) response = self._session.get( @@ -61,6 +59,11 @@ def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30) headers=user_agent_headers or self.user_agent_headers) return response + @lru_cache_freezeargs + @lru_cache(maxsize=cache_maxsize) + def cache_get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30): + return self.get(url, user_agent_headers, params, proxy, timeout) + def _get_proxy(self, proxy): # setup proxy in requests format if proxy is not None: diff --git a/yfinance/scrapers/fundamentals.py b/yfinance/scrapers/fundamentals.py index a0cc9cc56..34f6f6e1e 100644 --- a/yfinance/scrapers/fundamentals.py +++ b/yfinance/scrapers/fundamentals.py @@ -196,7 +196,7 @@ def get_financials_time_series(self, timescale, keys: list, proxy=None) -> pd.Da url += "&period1={}&period2={}".format(int(start_dt.timestamp()), int(end.timestamp())) # Step 3: fetch and reshape data - json_str = self._data.get(url=url, proxy=proxy).text + json_str = self._data.cache_get(url=url, proxy=proxy).text json_data = json.loads(json_str) data_raw = json_data["timeseries"]["result"] # data_raw = [v for v in data_raw if len(v) > 1] # Discard keys with no data diff --git a/yfinance/scrapers/holders.py b/yfinance/scrapers/holders.py index c130c22fa..76faad748 100644 --- a/yfinance/scrapers/holders.py +++ b/yfinance/scrapers/holders.py @@ -34,7 +34,7 @@ def mutualfund(self) -> pd.DataFrame: def _scrape(self, proxy): ticker_url = "{}/{}".format(self._SCRAPE_URL_, self._data.ticker) try: - resp = self._data.get(ticker_url + 
'/holders', proxy) + resp = self._data.cache_get(ticker_url + '/holders', proxy) holders = pd.read_html(resp.text) except Exception: holders = [] diff --git a/yfinance/scrapers/quote.py b/yfinance/scrapers/quote.py index a0f1dac6d..d14078435 100644 --- a/yfinance/scrapers/quote.py +++ b/yfinance/scrapers/quote.py @@ -198,7 +198,7 @@ def _scrape_complementary(self, proxy): int((datetime.datetime.now() - datetime.timedelta(days=365 // 2)).timestamp())) url += "&period2={}".format(int((datetime.datetime.now() + datetime.timedelta(days=1)).timestamp())) - json_str = self._data.get(url=url, proxy=proxy).text + json_str = self._data.cache_get(url=url, proxy=proxy).text json_data = json.loads(json_str) key_stats = json_data["timeseries"]["result"][0] if k not in key_stats: From 4c89e8aefa0ccead4695f78c60eb9403688457b6 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sat, 10 Dec 2022 18:27:23 +0000 Subject: [PATCH 35/41] Account for data delay ; Remove debug code ; Fix session test --- tests/ticker.py | 5 +++-- yfinance/base.py | 8 ++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/ticker.py b/tests/ticker.py index b1c2eaae8..70e45453b 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -149,11 +149,12 @@ def test_no_expensive_calls_introduced(self): session = requests_cache.CachedSession(backend='memory') ticker = yf.Ticker("GOOGL", session=session) ticker.history("1y") - actual_urls_called = tuple(session.cache.urls) + actual_urls_called = tuple([r.url for r in session.cache.filter()]) + session.close() expected_urls = ( 'https://query2.finance.yahoo.com/v8/finance/chart/GOOGL?range=1y&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains', ) - self.assertEquals(expected_urls, actual_urls_called, "Different than expected url used to fetch history.") + self.assertEqual(expected_urls, actual_urls_called, "Different than expected url used to fetch history.") def test_dividends(self): data = self.ticker.dividends diff --git 
a/yfinance/base.py b/yfinance/base.py index 0aaa48316..1769460ee 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -188,12 +188,9 @@ def history(self, period="1mo", interval="1d", get_fn = self._data.get if end is not None: end_dt = _pd.Timestamp(end, unit='s').tz_localize("UTC") - dt_now = _datetime.datetime.utcnow().astimezone(end_dt.tzinfo) - print(dt_now) dt_now = end_dt.tzinfo.localize(_datetime.datetime.utcnow()) - print(dt_now) - # if end_dt.date() <= dt_now.date(): - if end_dt <= dt_now: + data_delay = _datetime.timedelta(minutes=30) + if end_dt+data_delay <= dt_now: # Date range in past so safe to fetch through cache: get_fn = self._data.cache_get data = get_fn( @@ -208,7 +205,6 @@ def history(self, period="1mo", interval="1d", data = data.json() except Exception: - raise pass err_msg = "No data found for this date range, symbol may be delisted" From e91ffe48445a521770bd17b3cacc3d2355d41b48 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sat, 10 Dec 2022 20:59:34 +0000 Subject: [PATCH 36/41] Replace 'fallback' with 'legacy' arg --- README.md | 10 +++++----- yfinance/base.py | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 4b7af1128..9f23c189e 100644 --- a/README.md +++ b/README.md @@ -75,17 +75,17 @@ msft.capital_gains # show share count msft.shares -# show income statement +# Financials: +# - income statement msft.income_stmt msft.quarterly_income_stmt - -# show balance sheet +# - balance sheet msft.balance_sheet msft.quarterly_balance_sheet - -# show cash flow statement +# - cash flow statement msft.cashflow msft.quarterly_cashflow +# see `Ticker.get_income_stmt()` for more options # show major holders msft.major_holders diff --git a/yfinance/base.py b/yfinance/base.py index 30513924c..42c8b713d 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -757,13 +757,13 @@ def get_earnings(self, proxy=None, as_dict=False, freq="yearly"): return dict_data return data - def 
get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly", fallback=True): + def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly", legacy=False): self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_income_time_series(freq=freq, proxy=proxy) - if (data is None or data.empty) and fallback: - print(f"{self.ticker}: Yahoo not displaying {freq}-income so falling back to old table format") + if legacy: data = self._fundamentals.financials.get_income_scrape(freq=freq, proxy=proxy) + else: + data = self._fundamentals.financials.get_income_time_series(freq=freq, proxy=proxy) if pretty: data.index = utils.camel2title(data.index, sep=' ', acronyms=["EBIT", "EBITDA", "EPS", "NI"]) @@ -771,27 +771,27 @@ def get_income_stmt(self, proxy=None, as_dict=False, pretty=False, freq="yearly" return data.to_dict() return data - def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearly", fallback=True): + def get_balance_sheet(self, proxy=None, as_dict=False, pretty=False, freq="yearly", legacy=False): self._fundamentals.proxy = proxy - data = self._fundamentals.financials.get_balance_sheet_time_series(freq=freq, proxy=proxy) - if (data is None or data.empty) and fallback: - print(f"{self.ticker}: Yahoo not displaying {freq}-balance-sheet so falling back to old table format") + if legacy: data = self._fundamentals.financials.get_balance_sheet_scrape(freq=freq, proxy=proxy) - + else: + data = self._fundamentals.financials.get_balance_sheet_time_series(freq=freq, proxy=proxy) + if pretty: data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) if as_dict: return data.to_dict() return data - def get_cashflow(self, proxy=None, as_dict=False, pretty=False, freq="yearly", fallback=True): + def get_cashflow(self, proxy=None, as_dict=False, pretty=False, freq="yearly", legacy=False): self._fundamentals.proxy = proxy - data = 
self._fundamentals.financials.get_cash_flow_time_series(freq=freq, proxy=proxy) - - if (data is None or data.empty) and fallback: - print(f"{self.ticker}: Yahoo not displaying {freq}-cashflow so falling back to old table format") + + if legacy: data = self._fundamentals.financials.get_cash_flow_scrape(freq=freq, proxy=proxy) + else: + data = self._fundamentals.financials.get_cash_flow_time_series(freq=freq, proxy=proxy) if pretty: data.index = utils.camel2title(data.index, sep=' ', acronyms=["PPE"]) From 6f60a782625c4d64dd4bfe2a937ab90f044aa9f1 Mon Sep 17 00:00:00 2001 From: ymyke Date: Mon, 12 Dec 2022 17:16:05 +0100 Subject: [PATCH 37/41] Add `history_metadata` property Including test and README mention. See also https://github.com/ranaroussi/yfinance/issues/1195. --- README.md | 3 +++ tests/ticker.py | 3 +++ yfinance/base.py | 13 +++++++++++++ yfinance/ticker.py | 4 ++++ 4 files changed, 23 insertions(+) diff --git a/README.md b/README.md index fd7348fff..e493ed501 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,9 @@ msft.info # get historical market data hist = msft.history(period="max") +# show meta information about the history (requires history() to be called first) +msft.history_metadata + # show actions (dividends, splits, capital gains) msft.actions diff --git a/tests/ticker.py b/tests/ticker.py index 7f854e7c9..d75b1c6fe 100644 --- a/tests/ticker.py +++ b/tests/ticker.py @@ -136,7 +136,10 @@ def tearDown(self): self.ticker = None def test_history(self): + with self.assertRaises(RuntimeError): + self.ticker.history_metadata data = self.ticker.history("1y") + self.assertIn("IBM", self.ticker.history_metadata.values(), "metadata missing") self.assertIsInstance(data, pd.DataFrame, "data has wrong type") self.assertFalse(data.empty, "data is empty") diff --git a/yfinance/base.py b/yfinance/base.py index 9e6d144dd..b92fe767f 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -51,6 +51,7 @@ def __init__(self, ticker, session=None): self.ticker = 
ticker.upper() self.session = session self._history = None + self._history_metadata = None self._base_url = _BASE_URL_ self._scrape_url = _SCRAPE_URL_ self._tz = None @@ -234,6 +235,12 @@ def history(self, period="1mo", interval="1d", else: print('%s: %s' % (self.ticker, err_msg)) return utils.empty_df() + + # Store the meta data that gets retrieved simultaneously: + try: + self._history_metadata = data["chart"]["result"][0]["meta"] + except KeyError: + self._history_metadata = {} # parse quotes try: @@ -1003,3 +1010,9 @@ def get_earnings_dates(self, limit=12, proxy=None) -> Optional[pd.DataFrame]: self._earnings_dates[limit] = dates return dates + + def get_history_metadata(self) -> dict: + if self._history_metadata is None: + raise RuntimeError("Metadata was never retrieved so far, " + "call history() to retrieve it") + return self._history_metadata \ No newline at end of file diff --git a/yfinance/ticker.py b/yfinance/ticker.py index 0ee3c4577..20371ca26 100644 --- a/yfinance/ticker.py +++ b/yfinance/ticker.py @@ -222,3 +222,7 @@ def earnings_dates(self) -> _pd.DataFrame: @property def earnings_forecasts(self) -> _pd.DataFrame: return self.get_earnings_forecast() + + @property + def history_metadata(self) -> dict: + return self.get_history_metadata() From 4c41ba0a5038fccbbd0e369cd5039d1bcc408fbb Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Mon, 12 Dec 2022 16:43:24 +0000 Subject: [PATCH 38/41] Improve price repair Minimise _reconstruct_intervals() #requests ; Refine when to repair NaNs --- tests/prices.py | 83 ++++++--- yfinance/base.py | 416 +++++++++++++++++++++++++++++++--------------- yfinance/utils.py | 10 ++ 3 files changed, 352 insertions(+), 157 deletions(-) diff --git a/tests/prices.py b/tests/prices.py index 0b43248dc..e0d722578 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -270,11 +270,7 @@ def test_weekly_2rows_fix(self): df = dat.history(start=start, interval="1wk") self.assertTrue((df.index.weekday == 0).all()) - def 
test_repair_weekly_100x(self): - # Sometimes, Yahoo returns prices 100x the correct value. - # Suspect mixup between £/pence or $/cents etc. - # E.g. ticker PNL.L - + def test_repair_100x_weekly(self): # Setup: tkr = "PNL.L" dat = yf.Ticker(tkr, session=self.session) @@ -291,6 +287,7 @@ def test_repair_weekly_100x(self): _dt.date(2022, 10, 16), _dt.date(2022, 10, 9), _dt.date(2022, 10, 2)])) + df = df.sort_index() df.index.name = "Date" df_bad = df.copy() df_bad.loc["2022-10-23", "Close"] *= 100 @@ -305,7 +302,13 @@ def test_repair_weekly_100x(self): # First test - no errors left for c in data_cols: - self.assertTrue(_np.isclose(df_repaired[c], df[c], rtol=1e-2).all()) + try: + self.assertTrue(_np.isclose(df_repaired[c], df[c], rtol=1e-2).all()) + except: + print(df[c]) + print(df_repaired[c]) + raise + # Second test - all differences should be either ~1x or ~100x ratio = df_bad[data_cols].values / df[data_cols].values @@ -318,11 +321,7 @@ def test_repair_weekly_100x(self): f_1 = ratio == 1 self.assertTrue((f_100 | f_1).all()) - def test_repair_weekly_preSplit_100x(self): - # Sometimes, Yahoo returns prices 100x the correct value. - # Suspect mixup between £/pence or $/cents etc. - # E.g. ticker PNL.L - + def test_repair_100x_weekly_preSplit(self): # PNL.L has a stock-split in 2022. Sometimes requesting data before 2022 is not split-adjusted. tkr = "PNL.L" @@ -340,6 +339,7 @@ def test_repair_weekly_preSplit_100x(self): _dt.date(2020, 3, 23), _dt.date(2020, 3, 16), _dt.date(2020, 3, 9)])) + df = df.sort_index() # Simulate data missing split-adjustment: df[data_cols] *= 100.0 df["Volume"] *= 0.01 @@ -378,11 +378,7 @@ def test_repair_weekly_preSplit_100x(self): f_1 = ratio == 1 self.assertTrue((f_100 | f_1).all()) - def test_repair_daily_100x(self): - # Sometimes, Yahoo returns prices 100x the correct value. - # Suspect mixup between £/pence or $/cents etc. - # E.g. 
ticker PNL.L - + def test_repair_100x_daily(self): tkr = "PNL.L" dat = yf.Ticker(tkr, session=self.session) tz_exchange = dat.info["exchangeTimezoneName"] @@ -398,6 +394,7 @@ def test_repair_daily_100x(self): _dt.date(2022, 10, 31), _dt.date(2022, 10, 28), _dt.date(2022, 10, 27)])) + df = df.sort_index() df.index.name = "Date" df_bad = df.copy() df_bad.loc["2022-11-01", "Close"] *= 100 @@ -423,10 +420,7 @@ def test_repair_daily_100x(self): f_1 = ratio == 1 self.assertTrue((f_100 | f_1).all()) - def test_repair_daily_zeroes(self): - # Sometimes Yahoo returns price=0.0 when price obviously not zero - # E.g. ticker BBIL.L - + def test_repair_zeroes_daily(self): tkr = "BBIL.L" dat = yf.Ticker(tkr, session=self.session) tz_exchange = dat.info["exchangeTimezoneName"] @@ -440,18 +434,59 @@ def test_repair_daily_zeroes(self): index=_pd.to_datetime([_dt.datetime(2022, 11, 1), _dt.datetime(2022, 10, 31), _dt.datetime(2022, 10, 30)])) + df_bad = df_bad.sort_index() df_bad.index.name = "Date" df_bad.index = df_bad.index.tz_localize(tz_exchange) - repaired_df = dat._fix_zero_prices(df_bad, "1d", tz_exchange) + repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange) correct_df = df_bad.copy() - correct_df.loc[correct_df.index[0], "Open"] = 102.080002 - correct_df.loc[correct_df.index[0], "Low"] = 102.032501 - correct_df.loc[correct_df.index[0], "High"] = 102.080002 + correct_df.loc["2022-11-01", "Open"] = 102.080002 + correct_df.loc["2022-11-01", "Low"] = 102.032501 + correct_df.loc["2022-11-01", "High"] = 102.080002 for c in ["Open", "Low", "High", "Close"]: self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-8).all()) + def test_repair_zeroes_hourly(self): + tkr = "INTC" + dat = yf.Ticker(tkr, session=self.session) + tz_exchange = dat.info["exchangeTimezoneName"] + + df_bad = _pd.DataFrame(data={"Open": [29.68, 29.49, 29.545, _np.nan, 29.485], + "High": [29.68, 29.625, 29.58, _np.nan, 29.49], + "Low": [29.46, 29.4, 29.45, _np.nan, 29.31], + "Close": [29.485, 
29.545, 29.485, _np.nan, 29.325], + "Adj Close": [29.485, 29.545, 29.485, _np.nan, 29.325], + "Volume": [3258528, 2140195, 1621010, 0, 0]}, + index=_pd.to_datetime([_dt.datetime(2022,11,25, 9,30), + _dt.datetime(2022,11,25, 10,30), + _dt.datetime(2022,11,25, 11,30), + _dt.datetime(2022,11,25, 12,30), + _dt.datetime(2022,11,25, 13,00)])) + df_bad = df_bad.sort_index() + df_bad.index.name = "Date" + df_bad.index = df_bad.index.tz_localize(tz_exchange) + + repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange) + + correct_df = df_bad.copy() + idx = _pd.Timestamp(2022,11,25, 12,30).tz_localize(tz_exchange) + correct_df.loc[idx, "Open"] = 29.485001 + correct_df.loc[idx, "High"] = 29.49 + correct_df.loc[idx, "Low"] = 29.43 + correct_df.loc[idx, "Close"] = 29.455 + correct_df.loc[idx, "Adj Close"] = 29.455 + correct_df.loc[idx, "Volume"] = 609164 + for c in ["Open", "Low", "High", "Close"]: + try: + self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-7).all()) + except: + print("COLUMN", c) + print(repaired_df) + print(correct_df[c]) + print(repaired_df[c] - correct_df[c]) + raise + if __name__ == '__main__': unittest.main() diff --git a/yfinance/base.py b/yfinance/base.py index 53c1adff2..8a095beef 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -340,7 +340,7 @@ def history(self, period="1mo", interval="1d", if repair: # Do this before auto/back adjust - df = self._fix_zero_prices(df, interval, tz_exchange) + df = self._fix_zeroes(df, interval, tz_exchange) df = self._fix_unit_mixups(df, interval, tz_exchange) # Auto/back adjust @@ -385,13 +385,14 @@ def history(self, period="1mo", interval="1d", # ------------------------ - def _reconstruct_interval(self, df_row, interval, bad_fields): - if isinstance(df_row, _pd.DataFrame) or not isinstance(df_row, _pd.Series): - raise Exception("'df_row' must be a Pandas Series not", type(df_row)) - if not isinstance(bad_fields, (list, set, _np.ndarray)): - raise Exception("'bad_fields' must be a list/set 
not", type(bad_fields)) + def _reconstruct_intervals_batch(self, df, interval, tag=-1): + if not isinstance(df, _pd.DataFrame): + raise Exception("'df' must be a Pandas DataFrame not", type(df)) - data_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df_row.index] + # Reconstruct values in df using finer-grained price data. Delimiter marks what to reconstruct + + price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df] + data_cols = price_cols + ["Volume"] # If interval is weekly then can construct with daily. But if smaller intervals then # restricted to recent times: @@ -406,43 +407,147 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): # Correct by fetching day of hourly data sub_interval = "1h" td_range = _datetime.timedelta(days=1) + elif interval == "1h": + sub_interval = "30m" + td_range = _datetime.timedelta(hours=1) else: print("WARNING: Have not implemented repair for '{}' interval. Contact developers".format(interval)) - return df_row + raise Exception("why here") + return df - idx = df_row.name - start = idx.date() - if sub_interval == "1h" and (_datetime.date.today() - start) > _datetime.timedelta(days=729): - # Don't bother requesting more price data, Yahoo will reject - return None + df = df.sort_index() + + f_repair = df[data_cols].to_numpy()==tag + f_repair_rows = f_repair.any(axis=1) + + # Ignore old intervals for which Yahoo won't return finer data: + if sub_interval == "1h": + f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=730) + f_repair_rows = f_repair_rows & f_recent + elif sub_interval in ["30m", "15m"]: + f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=60) + f_repair_rows = f_repair_rows & f_recent + if not f_repair_rows.any(): + print("data too old to fix") + return df + + dts_to_repair = df.index[f_repair_rows] + indices_to_repair = _np.where(f_repair_rows)[0] + + if len(dts_to_repair) == 0: + return df + + df_v2 = 
df.copy() + df_noNa = df[~df[price_cols].isna().any(axis=1)] + + # Group nearby NaN-intervals together to reduce number of Yahoo fetches + dts_groups = [[dts_to_repair[0]]] + last_dt = dts_to_repair[0] + last_ind = indices_to_repair[0] + td = utils._interval_to_timedelta(interval) + if interval == "1mo": + grp_td_threshold = _datetime.timedelta(days=28) + elif interval == "1wk": + grp_td_threshold = _datetime.timedelta(days=28) + elif interval == "1d": + grp_td_threshold = _datetime.timedelta(days=14) + elif interval == "1h": + grp_td_threshold = _datetime.timedelta(days=7) else: - new_vals = {} + grp_td_threshold = _datetime.timedelta(days=2) + # grp_td_threshold = _datetime.timedelta(days=7) + for i in range(1, len(dts_to_repair)): + ind = indices_to_repair[i] + dt = dts_to_repair[i] + if (dt-dts_groups[-1][-1]) < grp_td_threshold: + dts_groups[-1].append(dt) + elif ind - last_ind <= 3: + dts_groups[-1].append(dt) + else: + dts_groups.append([dt]) + last_dt = dt + last_ind = ind + + # Add some good data to each group, so can calibrate later: + for i in range(len(dts_groups)): + g = dts_groups[i] + g0 = g[0] + i0 = df_noNa.index.get_loc(g0) + if i0 > 0: + dts_groups[i].insert(0, df_noNa.index[i0-1]) + gl = g[-1] + il = df_noNa.index.get_loc(gl) + if il < len(df_noNa)-1: + dts_groups[i].append(df_noNa.index[il+1]) - if sub_interval == "1h": - df_fine = self.history(start=start, end=start + td_range, interval=sub_interval, auto_adjust=False, prepost=True) + n_fixed = 0 + for g in dts_groups: + df_block = df[df.index.isin(g)] + + start_dt = g[0] + start_d = start_dt.date() + if sub_interval == "1h" and (_datetime.date.today() - start_d) > _datetime.timedelta(days=729): + # Don't bother requesting more price data, Yahoo will reject + continue + elif sub_interval in ["30m", "15m"] and (_datetime.date.today() - start_d) > _datetime.timedelta(days=59): + # Don't bother requesting more price data, Yahoo will reject + continue + + td_1d = _datetime.timedelta(days=1) + if 
interval in "1wk": + fetch_start = start_d - td_range # need previous week too + fetch_end = g[-1].date() + td_range + elif interval == "1d": + fetch_start = start_d + fetch_end = g[-1].date() + td_range + else: + fetch_start = g[0] + fetch_end = g[-1] + td_range + + prepost = interval == "1d" + df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True) + if df_fine is None or df_fine.empty: + print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval") + continue + + df_fine["ctr"] = 0 + if interval == "1wk": + # df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-SUN").start_time + weekdays = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + week_end_day = weekdays[(df_block.index[0].weekday()+7-1)%7] + df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-"+week_end_day).start_time + grp_col = "Week Start" + elif interval == "1d": + df_fine["Day Start"] = pd.to_datetime(df_fine.index.date) + grp_col = "Day Start" else: - df_fine = self.history(start=start - td_range, end=start + td_range, interval=sub_interval, - auto_adjust=False) - - # First, check whether df_fine has different split-adjustment than df_row. 
- # If different, then adjust df_fine to match df_row - good_fields = list(set(data_cols) - set(bad_fields) - set("Adj Close")) - if len(good_fields) > 0: - # median = df_row.loc[good_fields].median() - # median_fine = _np.median(df_fine[good_fields].values) - # ratio = median/median_fine - # Better method to calculate split-adjustment: - df_fine_from_idx = df_fine[df_fine.index >= idx] - ratios = [] - for f in good_fields: - if f == "Low": - ratios.append(df_row[f] / df_fine_from_idx[f].min()) - elif f == "High": - ratios.append(df_row[f] / df_fine_from_idx[f].max()) - elif f == "Open": - ratios.append(df_row[f] / df_fine_from_idx[f].iloc[0]) - elif f == "Close": - ratios.append(df_row[f] / df_fine_from_idx[f].iloc[-1]) + df_fine.loc[df_fine.index.isin(df_block.index), "ctr"] = 1 + df_fine["intervalID"] = df_fine["ctr"].cumsum() + df_fine = df_fine.drop("ctr", axis=1) + grp_col = "intervalID" + df_fine = df_fine[~df_fine[price_cols].isna().all(axis=1)] + + df_new = df_fine.groupby(grp_col).agg( + Open=("Open", "first"), + Close=("Close", "last"), + AdjClose=("Adj Close", "last"), + Low=("Low", "min"), + High=("High", "max"), + Volume=("Volume", "sum")).rename(columns={"AdjClose":"Adj Close"}) + if grp_col in ["Week Start", "Day Start"]: + df_new.index = df_new.index.tz_localize(df_fine.index.tz) + else: + df_fine["diff"] = df_fine["intervalID"].diff() + new_index = _np.append([df_fine.index[0]], df_fine.index[df_fine["intervalID"].diff()>0]) + df_new.index = new_index + + # Calibrate! Check whether 'df_fine' has different split-adjustment. 
+ # If different, then adjust to match 'df' + df_block_calib = df_block[price_cols] + calib_filter = df_block_calib.to_numpy() != tag + if calib_filter.any(): + df_new_calib = df_new[df_new.index.isin(df_block_calib.index)][price_cols] + ratios = (df_block_calib[price_cols].to_numpy() / df_new_calib[price_cols].to_numpy())[calib_filter] ratio = _np.mean(ratios) # ratio_rcp = round(1.0 / ratio, 1) @@ -454,38 +559,51 @@ def _reconstruct_interval(self, df_row, interval, bad_fields): if ratio > 1: # data has different split-adjustment than fine-grained data # Adjust fine-grained to match - df_fine[data_cols] *= ratio + df_new[price_cols] *= ratio + df_new["Volume"] /= ratio elif ratio_rcp > 1: # data has different split-adjustment than fine-grained data # Adjust fine-grained to match - df_fine[data_cols] *= 1.0 / ratio_rcp - - if sub_interval != "1h": - df_last_week = df_fine[df_fine.index < idx] - df_fine = df_fine[df_fine.index >= idx] - - if "High" in bad_fields: - new_vals["High"] = df_fine["High"].max() - if "Low" in bad_fields: - new_vals["Low"] = df_fine["Low"].min() - if "Open" in bad_fields: - if sub_interval != "1h" and idx != df_fine.index[0]: - # Exchange closed Monday. In this case, Yahoo sets Open to last week close - new_vals["Open"] = df_last_week["Close"][-1] - if "Low" in new_vals: - new_vals["Low"] = min(new_vals["Open"], new_vals["Low"]) - elif new_vals["Open"] < df_row["Low"]: - new_vals["Low"] = new_vals["Open"] - else: - new_vals["Open"] = df_fine["Open"].iloc[0] - if "Close" in bad_fields: - new_vals["Close"] = df_fine["Close"].iloc[-1] - # Assume 'Adj Close' also corrupted, easier than detecting whether true - new_vals["Adj Close"] = df_fine["Adj Close"].iloc[-1] - if "Volume" in bad_fields: - new_vals["Volume"] = df_fine["Volume"].sum() + df_new[price_cols] *= 1.0 / ratio_rcp + df_new["Volume"] *= ratio_rcp + + # Repair! 
+ bad_dts = df_block.index[(df_block[price_cols]==tag).any(axis=1)] + + for idx in bad_dts: + if not idx in df_new.index: + # Yahoo didn't return finer-grain data for this interval, + # so probably no trading happened. + print("no fine data") + continue + df_new_row = df_new.loc[idx] + + if interval == "1wk": + df_last_week = df_new.iloc[df_new.index.get_loc(idx)-1] + df_fine = df_fine.loc[idx:] + + df_bad_row = df.loc[idx] + bad_fields = df_bad_row.index[df_bad_row==tag].values + if "High" in bad_fields: + df_v2.loc[idx, "High"] = df_new_row["High"] + if "Low" in bad_fields: + df_v2.loc[idx, "Low"] = df_new_row["Low"] + if "Open" in bad_fields: + if interval == "1wk" and idx != df_fine.index[0]: + # Exchange closed Monday. In this case, Yahoo sets Open to last week close + df_v2.loc[idx, "Open"] = df_last_week["Close"] + df_v2.loc[idx, "Low"] = min(df_v2.loc[idx, "Open"], df_v2.loc[idx, "Low"]) + else: + df_v2.loc[idx, "Open"] = df_new_row["Open"] + if "Close" in bad_fields: + df_v2.loc[idx, "Close"] = df_new_row["Close"] + # Assume 'Adj Close' also corrupted, easier than detecting whether true + df_v2.loc[idx, "Adj Close"] = df_new_row["Adj Close"] + if "Volume" in bad_fields: + df_v2.loc[idx, "Volume"] = df_new_row["Volume"] + n_fixed += 1 - return new_vals + return df_v2 def _fix_unit_mixups(self, df, interval, tz_exchange): # Sometimes Yahoo returns few prices in cents/pence instead of $/£ @@ -518,66 +636,78 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): ratio = df2[data_cols].values / median ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20 f = ratio_rounded == 100 + if not f.any(): + return df - # Store each mixup: - mixups = {} - for j in range(len(data_cols)): - fj = f[:, j] - if fj.any(): - dc = data_cols[j] - for i in _np.where(fj)[0]: - idx = df2.index[i] - if idx not in mixups: - mixups[idx] = {"data": df2.loc[idx, data_cols], "fields": {dc}} - else: - mixups[idx]["fields"].add(dc) - n_mixups = len(mixups) - - if 
len(mixups) > 0: - # This first pass will correct all errors in Open/Close/AdjClose columns. - # It will also attempt to correct Low/High columns, but only if can get price data. - for idx in sorted(list(mixups.keys())): - m = mixups[idx] - new_values = self._reconstruct_interval(df2.loc[idx], interval, m["fields"]) - if not new_values is None: - for k in new_values: - df2.loc[idx, k] = new_values[k] - del mixups[idx] + # Mark values to send for repair + tag = -1.0 + for i in range(len(data_cols)): + fi = f[:,i] + c = data_cols[i] + df2.loc[fi, c] = tag + n_before = (df2[data_cols].to_numpy()==tag).sum() + df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) + n_after = (df2[data_cols].to_numpy()==tag).sum() + + if n_after > 0: # This second pass will *crudely* "fix" any remaining errors in High/Low - # simply by ensuring they don't contradict e.g. Low = 100x High - if len(mixups) > 0: - for idx in sorted(list(mixups.keys())): - m = mixups[idx] - row = df2.loc[idx, ["Open", "Close"]] - if "High" in m["fields"]: - df2.loc[idx, "High"] = row.max() - m["fields"].remove("High") - if "Low" in m["fields"]: - df2.loc[idx, "Low"] = row.min() - m["fields"].remove("Low") - - if len(m["fields"]) == 0: - del mixups[idx] - - n_fixed = n_mixups - len(mixups) - print("{}: fixed {} currency unit mixups in {} price data".format(self.ticker, n_fixed, interval)) - if len(mixups) > 0: - print(" ... and failed to correct {}".format(len(mixups))) + # simply by ensuring they don't contradict e.g. Low = 100x High. 
+ f = df2[data_cols].to_numpy()==tag + for i in range(f.shape[0]): + fi = f[i,:] + if not fi.any(): + continue + idx = df2.index[i] + + c = "Open" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 0.01 + # + c = "Close" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 0.01 + # + c = "High" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max() + # + c = "Low" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min() + + n_after_crude = (df2[data_cols].to_numpy()==tag).sum() + + n_fixed = n_before - n_after_crude + n_fixed_crudely = n_after - n_after_crude + if n_fixed > 0: + report_msg = f"{self.ticker}: fixed {n_fixed}/{n_before} currency unit mixups " + if n_fixed_crudely > 0: + report_msg += f"({n_fixed_crudely} crudely) " + report_msg += f"in {interval} price data" + print(report_msg) + + # Restore original values where repair failed + f = df2[data_cols].values==tag + for j in range(len(data_cols)): + fj = f[:,j] + if fj.any(): + c = data_cols[j] + df2.loc[fj, c] = df.loc[fj, c] return df2 - def _fix_zero_prices(self, df, interval, tz_exchange): - # Sometimes Yahoo returns prices=0 or NaN, but obviously wrong because e.g.: - # - Volume > 0 and Close > 0 - # - Dividends or Stock Splits > 0 - # Easy to detect and fix + def _fix_zeroes(self, df, interval, tz_exchange): + # Sometimes Yahoo returns prices=0 or NaN when trades occurred. + # But most times when prices=0 or NaN returned is because no trades. + # Impossible to distinguish, so only attempt repair if few or rare. 
if df.shape[0] == 0: return df - if df.shape[0] == 1: - # Need multiple rows to confidently identify outliers - return df df2 = df.copy() @@ -586,25 +716,45 @@ def _fix_zero_prices(self, df, interval, tz_exchange): else: df2.index = df2.index.tz_convert(tz_exchange) - data_cols = [c for c in ["Open", "High", "Low", "Close"] if c in df2.columns] - f_zero_or_nan = (df2[data_cols] == 0.0).values.any(axis=1) | df2[data_cols].isna().values.any(axis=1) - f_fixable = (df2[[c for c in ["Close","Volume","Dividends","Stock Splits"] if c in df2.columns]]>0).any(axis=1) - f_repair = f_zero_or_nan & f_fixable - - n_fixed = 0 - data_cols += ["Adj Close", "Volume"] - for i in _np.where(f_repair)[0]: - idx = df2.index[i] - df_row = df2.loc[idx, data_cols] - bad_fields = df_row.index[(df_row.values==0.0)|df_row.isna().values].values - new_values = self._reconstruct_interval(df2.loc[idx], interval, bad_fields) - if not new_values is None: - for k in new_values: - df2.loc[idx, k] = new_values[k] - n_fixed += 1 + price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df2.columns] + f_zero_or_nan = (df2[price_cols] == 0.0).values | df2[price_cols].isna().values + # Check whether worth attempting repair + if f_zero_or_nan.any(axis=1).sum() == 0: + return df + if f_zero_or_nan.sum() == len(price_cols)*len(df2): + # Need some good data to calibrate + return df + # - avoid repair if many zeroes/NaNs + pct_zero_or_nan = f_zero_or_nan.sum() / (len(price_cols)*len(df2)) + if f_zero_or_nan.any(axis=1).sum()>2 and pct_zero_or_nan > 0.05: + return df + data_cols = price_cols + ["Volume"] + + # Mark values to send for repair + tag = -1.0 + for i in range(len(price_cols)): + c = price_cols[i] + df2.loc[f_zero_or_nan[:,i], c] = tag + # If volume=0 or NaN for bad prices, then tag volume for repair + df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"]==0), "Volume"] = tag + df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"].isna()), "Volume"] = tag + + n_before = 
(df2[data_cols].to_numpy()==tag).sum() + df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) + n_after = (df2[data_cols].to_numpy()==tag).sum() + n_fixed = n_before - n_after if n_fixed > 0: print("{}: fixed {} price=0.0 errors in {} price data".format(self.ticker, n_fixed, interval)) + + # Restore original values where repair failed (i.e. remove tag values) + f = df2[data_cols].values==tag + for j in range(len(data_cols)): + fj = f[:,j] + if fj.any(): + c = data_cols[j] + df2.loc[fj, c] = df.loc[fj, c] + return df2 def _get_ticker_tz(self, debug_mode, proxy, timeout): diff --git a/yfinance/utils.py b/yfinance/utils.py index df6cdbec7..8ea821f52 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -22,6 +22,7 @@ from __future__ import print_function import datetime as _datetime +import dateutil as _dateutil from typing import Dict, Union, List, Optional import pytz as _tz @@ -300,6 +301,15 @@ def _parse_user_dt(dt, exchange_tz): return dt +def _interval_to_timedelta(interval): + if interval == "1mo": + return _dateutil.relativedelta(months=1) + elif interval == "1wk": + return _pd.Timedelta(days=7, unit='d') + else: + return _pd.Timedelta(interval) + + def auto_adjust(data): col_order = data.columns df = data.copy() From 85ef53c6bb39aa554a34bbc4a9bf3d26bb7d2201 Mon Sep 17 00:00:00 2001 From: ymyke Date: Tue, 13 Dec 2022 08:27:12 +0100 Subject: [PATCH 39/41] Store `_history_metadata` earlier and use that attribute for further metadata access in the same function --- yfinance/base.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index b92fe767f..3ce141a58 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -208,6 +208,12 @@ def history(self, period="1mo", interval="1d", except Exception: pass + # Store the meta data that gets retrieved simultaneously + try: + self._history_metadata = data["chart"]["result"][0]["meta"] + except KeyError: + self._history_metadata = {} + 
err_msg = "No data found for this date range, symbol may be delisted" fail = False if data is None or not type(data) is dict: @@ -221,9 +227,9 @@ def history(self, period="1mo", interval="1d", elif "chart" not in data or data["chart"]["result"] is None or not data["chart"]["result"]: fail = True elif period is not None and "timestamp" not in data["chart"]["result"][0] and period not in \ - data["chart"]["result"][0]["meta"]["validRanges"]: + self._history_metadata["validRanges"]: # User provided a bad period. The minimum should be '1d', but sometimes Yahoo accepts '1h'. - err_msg = "Period '{}' is invalid, must be one of {}".format(period, data["chart"]["result"][0]["meta"][ + err_msg = "Period '{}' is invalid, must be one of {}".format(period, self._history_metadata[ "validRanges"]) fail = True if fail: @@ -236,12 +242,6 @@ def history(self, period="1mo", interval="1d", print('%s: %s' % (self.ticker, err_msg)) return utils.empty_df() - # Store the meta data that gets retrieved simultaneously: - try: - self._history_metadata = data["chart"]["result"][0]["meta"] - except KeyError: - self._history_metadata = {} - # parse quotes try: quotes = utils.parse_quotes(data["chart"]["result"][0]) @@ -281,9 +281,9 @@ def history(self, period="1mo", interval="1d", pass # Select useful info from metadata - quote_type = data["chart"]["result"][0]["meta"]["instrumentType"] + quote_type = self._history_metadata["instrumentType"] expect_capital_gains = quote_type in ('MUTUALFUND', 'ETF') - tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"] + tz_exchange = self._history_metadata["exchangeTimezoneName"] # Note: ordering is important. If you change order, run the tests! 
quotes = utils.set_df_tz(quotes, params["interval"], tz_exchange) From 6dca1eea968a0daa3ed69b8f020cb8de35c18ff2 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Tue, 13 Dec 2022 14:47:27 +0000 Subject: [PATCH 40/41] Don't repair prices if can't calibrate --- yfinance/base.py | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 8a095beef..960a95cd1 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -545,27 +545,29 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): # If different, then adjust to match 'df' df_block_calib = df_block[price_cols] calib_filter = df_block_calib.to_numpy() != tag - if calib_filter.any(): - df_new_calib = df_new[df_new.index.isin(df_block_calib.index)][price_cols] - ratios = (df_block_calib[price_cols].to_numpy() / df_new_calib[price_cols].to_numpy())[calib_filter] - ratio = _np.mean(ratios) - # - ratio_rcp = round(1.0 / ratio, 1) - ratio = round(ratio, 1) - if ratio == 1 and ratio_rcp == 1: - # Good! - pass - else: - if ratio > 1: - # data has different split-adjustment than fine-grained data - # Adjust fine-grained to match - df_new[price_cols] *= ratio - df_new["Volume"] /= ratio - elif ratio_rcp > 1: - # data has different split-adjustment than fine-grained data - # Adjust fine-grained to match - df_new[price_cols] *= 1.0 / ratio_rcp - df_new["Volume"] *= ratio_rcp + if not calib_filter.any(): + # Can't calibrate so don't attempt repair + continue + df_new_calib = df_new[df_new.index.isin(df_block_calib.index)][price_cols] + ratios = (df_block_calib[price_cols].to_numpy() / df_new_calib[price_cols].to_numpy())[calib_filter] + ratio = _np.mean(ratios) + # + ratio_rcp = round(1.0 / ratio, 1) + ratio = round(ratio, 1) + if ratio == 1 and ratio_rcp == 1: + # Good! 
+ pass + else: + if ratio > 1: + # data has different split-adjustment than fine-grained data + # Adjust fine-grained to match + df_new[price_cols] *= ratio + df_new["Volume"] /= ratio + elif ratio_rcp > 1: + # data has different split-adjustment than fine-grained data + # Adjust fine-grained to match + df_new[price_cols] *= 1.0 / ratio_rcp + df_new["Volume"] *= ratio_rcp # Repair! bad_dts = df_block.index[(df_block[price_cols]==tag).any(axis=1)] From 11a3a9d457d2900dd262c7bc675c8b4aeb02c057 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sat, 10 Dec 2022 18:16:44 +0000 Subject: [PATCH 41/41] Raise min lxml & pandas, sync all reqs lists --- README.md | 15 +++++++++------ meta.yaml | 26 +++++++++++++++++--------- requirements.txt | 4 ++-- setup.py | 4 ++-- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index b5746c3b7..365a26ccc 100644 --- a/README.md +++ b/README.md @@ -294,12 +294,15 @@ To install `yfinance` using `conda`, see ### Requirements - [Python](https://www.python.org) \>= 2.7, 3.4+ -- [Pandas](https://github.com/pydata/pandas) (tested to work with - \>=0.23.1) -- [Numpy](http://www.numpy.org) \>= 1.11.1 -- [requests](http://docs.python-requests.org/en/master/) \>= 2.14.2 -- [lxml](https://pypi.org/project/lxml/) \>= 4.5.1 -- [appdirs](https://pypi.org/project/appdirs) \>=1.4.4 +- [Pandas](https://github.com/pydata/pandas) \>= 1.3.0 +- [Numpy](http://www.numpy.org) \>= 1.16.5 +- [requests](http://docs.python-requests.org/en/master) \>= 2.26 +- [lxml](https://pypi.org/project/lxml) \>= 4.9.1 +- [appdirs](https://pypi.org/project/appdirs) \>= 1.4.4 +- [pytz](https://pypi.org/project/pytz) \>=2022.5 +- [frozendict](https://pypi.org/project/frozendict) \>= 2.3.4 +- [beautifulsoup4](https://pypi.org/project/beautifulsoup4) \>= 4.11.1 +- [html5lib](https://pypi.org/project/html5lib) \>= 1.1 ### Optional (if you want to use `pandas_datareader`) diff --git a/meta.yaml b/meta.yaml index 9d97dcfc2..6d3345fdf 100644 --- 
a/meta.yaml +++ b/meta.yaml @@ -1,5 +1,5 @@ {% set name = "yfinance" %} -{% set version = "0.1.58" %} +{% set version = "0.2.0" %} package: name: "{{ name|lower }}" @@ -16,22 +16,30 @@ build: requirements: host: - - pandas >=0.24.0 + - pandas >=1.3.0 - numpy >=1.16.5 - - requests >=2.21 + - requests >=2.26 - multitasking >=0.0.7 - - lxml >=4.5.1 - - appdirs >= 1.4.4 + - lxml >=4.9.1 + - appdirs >=1.4.4 + - pytz >=2022.5 + - frozendict >=2.3.4 + - beautifulsoup4 >=4.11.1 + - html5lib >=1.1 - pip - python run: - - pandas >=0.24.0 + - pandas >=1.3.0 - numpy >=1.16.5 - - requests >=2.21 + - requests >=2.26 - multitasking >=0.0.7 - - lxml >=4.5.1 - - appdirs >= 1.4.4 + - lxml >=4.9.1 + - appdirs >=1.4.4 + - pytz >=2022.5 + - frozendict >=2.3.4 + - beautifulsoup4 >=4.11.1 + - html5lib >=1.1 - python test: diff --git a/requirements.txt b/requirements.txt index d63896ebb..5f467b322 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -pandas>=1.1.0 +pandas>=1.3.0 numpy>=1.16.5 requests>=2.26 multitasking>=0.0.7 -lxml>=4.5.1 +lxml>=4.9.1 appdirs>=1.4.4 pytz>=2022.5 frozendict>=2.3.4 diff --git a/setup.py b/setup.py index fac4f1123..3261b09ef 100644 --- a/setup.py +++ b/setup.py @@ -59,9 +59,9 @@ platforms=['any'], keywords='pandas, yahoo finance, pandas datareader', packages=find_packages(exclude=['contrib', 'docs', 'tests', 'examples']), - install_requires=['pandas>=1.1.0', 'numpy>=1.15', + install_requires=['pandas>=1.3.0', 'numpy>=1.16.5', 'requests>=2.26', 'multitasking>=0.0.7', - 'lxml>=4.5.1', 'appdirs>=1.4.4', 'pytz>=2022.5', + 'lxml>=4.9.1', 'appdirs>=1.4.4', 'pytz>=2022.5', 'frozendict>=2.3.4', 'beautifulsoup4>=4.11.1', 'html5lib>=1.1'], entry_points={