From 7d45a6709a8371c9763ad3d276bbfc6137891cf9 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sun, 2 Oct 2022 18:20:11 +0100 Subject: [PATCH 1/3] Fix merging of dividends/splits with prices --- yfinance/base.py | 35 +++++++++---- yfinance/utils.py | 125 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 11 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 6885d374b..1471f2260 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -288,13 +288,32 @@ def history(self, period="1mo", interval="1d", tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"] - # combine - df = _pd.concat([quotes, dividends, splits], axis=1, sort=True) - df["Dividends"].fillna(0, inplace=True) - df["Stock Splits"].fillna(0, inplace=True) + # prepare index for combine: + quotes.index = quotes.index.tz_localize("UTC").tz_convert(tz_exchange) + splits.index = splits.index.tz_localize("UTC").tz_convert(tz_exchange) + dividends.index = dividends.index.tz_localize("UTC").tz_convert(tz_exchange) + if params["interval"] in ["1d","1w","1wk","1mo","3mo"]: + # Converting datetime->date should improve merge performance + quotes.index = _pd.to_datetime(quotes.index.date) + splits.index = _pd.to_datetime(splits.index.date) + dividends.index = _pd.to_datetime(dividends.index.date) - # index eod/intraday - df.index = df.index.tz_localize("UTC").tz_convert(tz_exchange) + # combine + df = quotes + if actions: + df = df.sort_index() + if dividends.shape[0] > 0: + df = utils.safe_merge_dfs(df, dividends, interval) + if "Dividends" in df.columns: + df.loc[df["Dividends"].isna(),"Dividends"] = 0 + else: + df["Dividends"] = 0.0 + if splits.shape[0] > 0: + df = utils.safe_merge_dfs(df, splits, interval) + if "Stock Splits" in df.columns: + df.loc[df["Stock Splits"].isna(),"Stock Splits"] = 0 + else: + df["Stock Splits"] = 0.0 df = utils.fix_Yahoo_dst_issue(df, params["interval"]) @@ -303,7 +322,6 @@ def history(self, period="1mo", interval="1d", elif params["interval"] == "1h": pass else: - df.index = _pd.to_datetime(df.index.date).tz_localize(tz_exchange) df.index.name = "Date" # duplicates and missing rows cleanup @@ -312,9 +330,6 @@ def history(self, period="1mo", interval="1d", self._history = df.copy() - if not actions: - df.drop(columns=["Dividends", "Stock Splits"], inplace=True) - return df # ------------------------ diff --git a/yfinance/utils.py b/yfinance/utils.py index d611575c3..1be82702a 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -247,11 +247,134 @@ def parse_actions(data): splits.sort_index(inplace=True) splits["Stock Splits"] = splits["numerator"] / \ splits["denominator"] - splits = splits["Stock Splits"] + splits = splits[["Stock Splits"]] return dividends, splits +def safe_merge_dfs(df_main, df_sub, interval): + # Carefully merge 'df_sub' onto 'df_main' + # If naive merge fails, try again with reindexing df_sub: + # 1) if interval is weekly or monthly, then try with index set to start of week/month + # 2) if still failing then manually search through df_main.index to reindex df_sub + + if df_sub.shape[0] == 0: + raise Exception("No data to merge") + + df_sub_backup = df_sub.copy() + data_cols = [c for c in df_sub.columns if not c in df_main] + if len(data_cols) > 1: + raise Exception("Expected 1 data col") + data_col = data_cols[0] + + def _reindex_events(df, new_index, data_col_name): + if len(new_index) == len(set(new_index)): + # No duplicates, easy + df.index = new_index + return df + + df["_NewIndex"] = new_index + # Duplicates present within periods but can aggregate + if data_col_name == "Dividends": + # Add + df = df.groupby("_NewIndex").sum() + df.index.name = None + elif data_col_name == "Stock Splits": + # Product + df = df.groupby("_NewIndex").prod() + df.index.name = None + else: + raise Exception("New index contains duplicates but unsure how to aggregate for '{}'".format(data_col_name)) + if "_NewIndex" in df.columns: + df = df.drop("_NewIndex",axis=1) + return df + + # Discard last row in 'df_sub' if significantly after last row in df_main. + # Size of difference depends on interval. + df_sub = df_sub[df_sub.index >= df_main.index[0]] + df_sub_last_dt = df_sub.index[-1] + df_main_last_dt = df_main.index[-1] + if df_sub_last_dt > df_main_last_dt: + if interval == "1mo" and df_sub_last_dt.month != df_main_last_dt.month: + df_sub = df_sub.drop(df_sub.index[-1]) + elif interval in ["1wk","5d"] and df_sub_last_dt.week != df_main_last_dt.week: + df_sub = df_sub.drop(df_sub.index[-1]) + elif interval == "1d" and df_sub_last_dt.date() > df_main_last_dt.date(): + df_sub = df_sub.drop(df_sub.index[-1]) + elif (interval.endswith('h') or interval.endswith('m')) and (df_sub_last_dt.date() > df_main_last_dt.date()): + df_sub = df_sub.drop(df_sub.index[-1]) + if df_sub.shape[0] == 0: + # raise Exception("No data to merge after pruning out-of-range") + return df_main + + df = df_main.join(df_sub) + + f_na = df[data_col].isna() + data_lost = sum(~f_na) < df_sub.shape[0] + if not data_lost: + return df + # Lost data during join() + if interval in ["1wk","1mo","3mo"]: + # Backdate all df_sub.index dates to start of week/month + if interval == "1wk": + new_index = _pd.PeriodIndex(df_sub.index, freq='W').to_timestamp() + elif interval == "1mo": + new_index = _pd.PeriodIndex(df_sub.index, freq='M').to_timestamp() + elif interval == "3mo": + new_index = _pd.PeriodIndex(df_sub.index, freq='Q').to_timestamp() + df_sub = _reindex_events(df_sub, new_index, data_col) + df = df_main.join(df_sub) + + f_na = df[data_col].isna() + data_lost = sum(~f_na) < df_sub.shape[0] + if not data_lost: + return df + # Lost data during join(). Manually check each df_sub.index date against df_main.index to + # find matching interval + df_sub = df_sub_backup.copy() + new_index = [-1]*df_sub.shape[0] + for i in range(df_sub.shape[0]): + dt_sub_i = df_sub.index[i] + if dt_sub_i in df_main.index: + new_index[i] = dt_sub_i ; continue + # Found a bad index date, need to search for near-match in df_main (same week/month) + fixed = False + for j in range(df_main.shape[0]-1): + dt_main_j0 = df_main.index[j] + dt_main_j1 = df_main.index[j+1] + if (dt_main_j0 <= dt_sub_i) and (dt_sub_i < dt_main_j1): + dt_sub_i = dt_main_j0 ; fixed = True ; break + if not fixed: + last_main_dt = df_main.index[df_main.shape[0]-1] + diff = dt_sub_i - last_main_dt + if interval == "1mo" and last_main_dt.month == dt_sub_i.month: + dt_sub_i = last_main_dt ; fixed = True + elif interval == "3mo" and last_main_dt.year == dt_sub_i.year and last_main_dt.quarter == dt_sub_i.quarter: + dt_sub_i = last_main_dt ; fixed = True + elif interval == "1wk" and last_main_dt.week == dt_sub_i.week: + dt_sub_i = last_main_dt ; fixed = True + elif interval == "1d" and last_main_dt.day == dt_sub_i.day: + dt_sub_i = last_main_dt ; fixed = True + elif interval == "1h" and last_main_dt.hour == dt_sub_i.hour: + dt_sub_i = last_main_dt ; fixed = True + else: + td = _pd.to_timedelta(interval) + if (dt_sub_i-last_main_dt) < td: + dt_sub_i = last_main_dt ; fixed = True + if not fixed: + raise Exception("df_sub table contains row that failed to map to row in main table") + new_index[i] = dt_sub_i + df_sub = _reindex_events(df_sub, new_index, data_col) + df = df_main.join(df_sub) + + f_na = df[data_col].isna() + data_lost = sum(~f_na) < df_sub.shape[0] + if data_lost: + raise Exception("Lost data during merge despite all attempts to align data") + + return df + + def fix_Yahoo_dst_issue(df, interval): if interval in ["1d","1w","1wk"]: # These intervals should start at time 00:00. But for some combinations of date and timezone, From 1c85433cc0a75cf1ec18e3ce3ef00c5a937da0fd Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Mon, 10 Oct 2022 13:58:17 +0100 Subject: [PATCH 2/3] Add unittest for div/splits merging --- tests/__init__.py | 1 + tests/context.py | 9 ++++++++ tests/prices.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/context.py create mode 100644 tests/prices.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..fef66b5a2 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python \ No newline at end of file diff --git a/tests/context.py b/tests/context.py new file mode 100644 index 000000000..fe647f890 --- /dev/null +++ b/tests/context.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +import sys +import os +_parent_dp = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +_src_dp = _parent_dp +sys.path.insert(0, _src_dp) + +import yfinance diff --git a/tests/prices.py b/tests/prices.py new file mode 100644 index 000000000..35a82eefd --- /dev/null +++ b/tests/prices.py @@ -0,0 +1,52 @@ +from .context import yfinance as yf + +import unittest + +class TestPriceHistory(unittest.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_weeklyWithEvents(self): + # Reproduce issue #521 + tkr1 = "QQQ" + tkr2 = "GDX" + start_d = "2014-12-29" + end_d = "2020-11-29" + df1 = yf.Ticker(tkr1).history(start=start_d, end=end_d, interval="1wk", actions=True) + df2 = yf.Ticker(tkr2).history(start=start_d, end=end_d, interval="1wk", actions=True) + try: + self.assertTrue(df1.index.equals(df2.index)) + except: + missing_from_df1 = df2.index.difference(df1.index) + missing_from_df2 = df1.index.difference(df2.index) + print("{} missing these dates: {}".format(tkr1, missing_from_df1)) + print("{} missing these dates: {}".format(tkr2, missing_from_df2)) + raise + + # Test that index same with and without events: + tkrs = [tkr1, tkr2] + for tkr in tkrs: + df1 = yf.Ticker(tkr).history(start=start_d, end=end_d, interval="1wk", actions=True) + df2 = yf.Ticker(tkr).history(start=start_d, end=end_d, interval="1wk", actions=False) + try: + self.assertTrue(df1.index.equals(df2.index)) + except: + missing_from_df1 = df2.index.difference(df1.index) + missing_from_df2 = df1.index.difference(df2.index) + print("{}-with-events missing these dates: {}".format(tkr, missing_from_df1)) + print("{}-without-events missing these dates: {}".format(tkr, missing_from_df2)) + raise + +if __name__ == '__main__': + unittest.main() + + # Run tests sequentially: + import inspect + test_src = inspect.getsource(TestPriceHistory) + unittest.TestLoader.sortTestMethodsUsing = lambda _, x, y: ( + test_src.index(f"def {x}") - test_src.index(f"def {y}") + ) + unittest.main(verbosity=2) From a72458555260696b2593d4f3dec76b5cf1fdc586 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Mon, 10 Oct 2022 14:00:10 +0100 Subject: [PATCH 3/3] Tidy syntax --- tests/__init__.py | 2 +- tests/prices.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index fef66b5a2..4265cc3e6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1 @@ -#!/usr/bin/env python \ No newline at end of file +#!/usr/bin/env python diff --git a/tests/prices.py b/tests/prices.py index 35a82eefd..239e62db9 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -42,11 +42,3 @@ def test_weeklyWithEvents(self): if __name__ == '__main__': unittest.main() - - # Run tests sequentially: - import inspect - test_src = inspect.getsource(TestPriceHistory) - unittest.TestLoader.sortTestMethodsUsing = lambda _, x, y: ( - test_src.index(f"def {x}") - test_src.index(f"def {y}") - ) - unittest.main(verbosity=2)