From 7d45a6709a8371c9763ad3d276bbfc6137891cf9 Mon Sep 17 00:00:00 2001
From: ValueRaider <ValueRaider@protonmail.com>
Date: Sun, 2 Oct 2022 18:20:11 +0100
Subject: [PATCH 1/3] Fix merging of dividends/splits with prices

---
 yfinance/base.py  |  35 +++++++++----
 yfinance/utils.py | 125 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 11 deletions(-)

diff --git a/yfinance/base.py b/yfinance/base.py
index 6885d374b..1471f2260 100644
--- a/yfinance/base.py
+++ b/yfinance/base.py
@@ -288,13 +288,32 @@ def history(self, period="1mo", interval="1d",
 
         tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"]
 
-        # combine
-        df = _pd.concat([quotes, dividends, splits], axis=1, sort=True)
-        df["Dividends"].fillna(0, inplace=True)
-        df["Stock Splits"].fillna(0, inplace=True)
+        # prepare index for combine:
+        quotes.index = quotes.index.tz_localize("UTC").tz_convert(tz_exchange)
+        splits.index = splits.index.tz_localize("UTC").tz_convert(tz_exchange)
+        dividends.index = dividends.index.tz_localize("UTC").tz_convert(tz_exchange)
+        if params["interval"] in ["1d","1w","1wk","1mo","3mo"]:
+            # Converting datetime->date should improve merge performance
+            quotes.index = _pd.to_datetime(quotes.index.date)
+            splits.index = _pd.to_datetime(splits.index.date)
+            dividends.index = _pd.to_datetime(dividends.index.date)
 
-        # index eod/intraday
-        df.index = df.index.tz_localize("UTC").tz_convert(tz_exchange)
+        # combine
+        df = quotes
+        if actions:
+            df = df.sort_index()
+            if dividends.shape[0] > 0:
+                df = utils.safe_merge_dfs(df, dividends, interval)
+            if "Dividends" in df.columns:
+                df.loc[df["Dividends"].isna(),"Dividends"] = 0
+            else:
+                df["Dividends"] = 0.0
+            if splits.shape[0] > 0:
+                df = utils.safe_merge_dfs(df, splits, interval)
+            if "Stock Splits" in df.columns:
+                df.loc[df["Stock Splits"].isna(),"Stock Splits"] = 0
+            else:
+                df["Stock Splits"] = 0.0
 
         df = utils.fix_Yahoo_dst_issue(df, params["interval"])
             
@@ -303,7 +322,6 @@ def history(self, period="1mo", interval="1d",
         elif params["interval"] == "1h":
             pass
         else:
-            df.index = _pd.to_datetime(df.index.date).tz_localize(tz_exchange)
             df.index.name = "Date"
 
         # duplicates and missing rows cleanup
@@ -312,9 +330,6 @@ def history(self, period="1mo", interval="1d",
 
         self._history = df.copy()
 
-        if not actions:
-            df.drop(columns=["Dividends", "Stock Splits"], inplace=True)
-
         return df
 
     # ------------------------
diff --git a/yfinance/utils.py b/yfinance/utils.py
index d611575c3..1be82702a 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -247,11 +247,134 @@ def parse_actions(data):
             splits.sort_index(inplace=True)
             splits["Stock Splits"] = splits["numerator"] / \
                 splits["denominator"]
-            splits = splits["Stock Splits"]
+            splits = splits[["Stock Splits"]]
 
     return dividends, splits
 
 
+def safe_merge_dfs(df_main, df_sub, interval):
+    """Left-join the single event column of 'df_sub' onto the rows of 'df_main'.
+
+    Event timestamps (dividends, stock splits) do not always coincide with a
+    row of the price table. If the naive join loses events, retry with
+    'df_sub' reindexed:
+    1) if interval is weekly/monthly/quarterly, backdate each event to the
+       start of its week/month/quarter;
+    2) if still failing, assign each event to the last row of 'df_main' that
+       is at or before it.
+    Events that end up on the same row are combined: summed for "Dividends",
+    multiplied for "Stock Splits".
+
+    Args:
+        df_main: price table; its index must be sorted ascending.
+        df_sub: event table with exactly one column not present in 'df_main'.
+        interval: price interval string, e.g. "1d", "1wk", "1mo", "1h".
+
+    Returns:
+        'df_main' joined with the event column (NaN where no event), or
+        'df_main' unchanged when no event falls within its date range.
+
+    Raises:
+        Exception: if 'df_sub' is empty or does not add exactly one column,
+            or if an event cannot be mapped onto any row of 'df_main'.
+    """
+    if df_sub.shape[0] == 0:
+        raise Exception("No data to merge")
+
+    data_cols = [c for c in df_sub.columns if c not in df_main]
+    if len(data_cols) != 1:
+        raise Exception("Expected 1 data col")
+    data_col = data_cols[0]
+
+    def _period_key(dt):
+        # Year-aware identifier of the calendar period that contains 'dt'
+        if interval == "3mo":
+            return (dt.year, dt.quarter)
+        if interval == "1mo":
+            return (dt.year, dt.month)
+        if interval in ("1wk", "5d"):
+            return tuple(dt.isocalendar())[:2]
+        if interval == "1d" or interval.endswith(("h", "m")):
+            return dt.date()
+        return None
+
+    def _reindex_events(df, new_index):
+        # Relabel the events; events landing on the same label are combined
+        df = df.copy()
+        if len(new_index) == len(set(new_index)):
+            df.index = new_index
+            return df
+        if data_col not in ("Dividends", "Stock Splits"):
+            raise Exception("New index contains duplicates but unsure how to aggregate for '{}'".format(data_col))
+        df["_NewIndex"] = new_index
+        grouped = df.groupby("_NewIndex")
+        df = grouped.sum() if data_col == "Dividends" else grouped.prod()
+        df.index.name = None
+        return df
+
+    def _lost_events(df_joined, df_events):
+        # True if fewer events survived the join than were supplied
+        return int(df_joined[data_col].notna().sum()) < df_events.shape[0]
+
+    # Nothing to align against, or no event at/after the first price row
+    if df_main.shape[0] == 0:
+        return df_main
+    df_sub = df_sub[df_sub.index >= df_main.index[0]]
+    if df_sub.shape[0] == 0:
+        return df_main
+    # Discard last row in 'df_sub' if it lies in a later period than the last
+    # row in 'df_main'. The keys include the year, so e.g. March of one year
+    # is not mistaken for March of the next.
+    df_sub_last_dt = df_sub.index[-1]
+    df_main_last_dt = df_main.index[-1]
+    if df_sub_last_dt > df_main_last_dt and _period_key(df_sub_last_dt) != _period_key(df_main_last_dt):
+        df_sub = df_sub.drop(df_sub.index[-1])
+        if df_sub.shape[0] == 0:
+            return df_main
+    # Snapshot AFTER pruning: the manual search must not revive pruned events
+    df_sub_pruned = df_sub.copy()
+
+    df = df_main.join(df_sub)
+    if not _lost_events(df, df_sub):
+        return df
+
+    if interval in ("1wk", "1mo", "3mo"):
+        # Backdate all df_sub.index dates to start of week/month/quarter
+        freq = {"1wk": "W", "1mo": "M", "3mo": "Q"}[interval]
+        new_index = _pd.PeriodIndex(df_sub.index, freq=freq).to_timestamp()
+        df_sub = _reindex_events(df_sub, new_index)
+        df = df_main.join(df_sub)
+        if not _lost_events(df, df_sub):
+            return df
+
+    # Still losing events. Map each df_sub.index date to the last row of
+    # 'df_main' at or before it (binary search, hence the sorted index).
+    df_sub = df_sub_pruned
+    main_index = df_main.index
+    new_index = []
+    for dt_sub_i in df_sub.index:
+        pos = main_index.searchsorted(dt_sub_i, side="right")
+        # pos >= 1: events before the first row were discarded above
+        dt_main = main_index[pos - 1]
+        if pos < len(main_index) or dt_main == dt_sub_i:
+            new_index.append(dt_main)
+            continue
+        # Event is after the last row: accept only if inside that row's period
+        fixed = interval in ("1d", "1wk", "1mo", "3mo") and _period_key(dt_sub_i) == _period_key(dt_main)
+        if not fixed and interval not in ("1wk", "1mo", "3mo"):
+            # Fixed-length intervals: accept if less than one interval late
+            fixed = (dt_sub_i - dt_main) < _pd.to_timedelta(interval)
+        if not fixed:
+            raise Exception("df_sub table contains row that failed to map to row in main table")
+        new_index.append(dt_main)
+    df_sub = _reindex_events(df_sub, new_index)
+    df = df_main.join(df_sub)
+    if _lost_events(df, df_sub):
+        raise Exception("Lost data during merge despite all attempts to align data")
+
+    return df
+
+
 def fix_Yahoo_dst_issue(df, interval):
     if interval in ["1d","1w","1wk"]:
         # These intervals should start at time 00:00. But for some combinations of date and timezone, 

From 1c85433cc0a75cf1ec18e3ce3ef00c5a937da0fd Mon Sep 17 00:00:00 2001
From: ValueRaider <ValueRaider@protonmail.com>
Date: Mon, 10 Oct 2022 13:58:17 +0100
Subject: [PATCH 2/3] Add unittest for div/splits merging

---
 tests/__init__.py |  1 +
 tests/context.py  |  9 ++++++++
 tests/prices.py   | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/context.py
 create mode 100644 tests/prices.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000..fef66b5a2
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+#!/usr/bin/env python
\ No newline at end of file
diff --git a/tests/context.py b/tests/context.py
new file mode 100644
index 000000000..fe647f890
--- /dev/null
+++ b/tests/context.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# Test-suite helper: makes the sibling 'yfinance' package importable.
+import sys
+import os
+_parent_dp = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+_src_dp = _parent_dp
+sys.path.insert(0, _src_dp)
+# Imported after the sys.path change so the parent directory is searched first
+import yfinance
diff --git a/tests/prices.py b/tests/prices.py
new file mode 100644
index 000000000..35a82eefd
--- /dev/null
+++ b/tests/prices.py
@@ -0,0 +1,52 @@
+from .context import yfinance as yf
+
+import unittest
+
+class TestPriceHistory(unittest.TestCase):
+	"""Checks on the index of price tables returned by Ticker.history()."""
+
+	def test_weeklyWithEvents(self):
+		"""Reproduce issue #521: weekly indexes must agree across tickers and
+		with/without events. Calls history() on real tickers (presumably
+		needs network access - confirm)."""
+		tkr1 = "QQQ"
+		tkr2 = "GDX"
+		start_d = "2014-12-29"
+		end_d = "2020-11-29"
+		df1 = yf.Ticker(tkr1).history(start=start_d, end=end_d, interval="1wk", actions=True)
+		df2 = yf.Ticker(tkr2).history(start=start_d, end=end_d, interval="1wk", actions=True)
+		try:
+			self.assertTrue(df1.index.equals(df2.index))
+		except AssertionError:
+			# Report which dates differ, then let the failure propagate
+			missing_from_df1 = df2.index.difference(df1.index)
+			missing_from_df2 = df1.index.difference(df2.index)
+			print("{} missing these dates: {}".format(tkr1, missing_from_df1))
+			print("{} missing these dates: {}".format(tkr2, missing_from_df2))
+			raise
+
+		# Test that index same with and without events:
+		tkrs = [tkr1, tkr2]
+		for tkr in tkrs:
+			df1 = yf.Ticker(tkr).history(start=start_d, end=end_d, interval="1wk", actions=True)
+			df2 = yf.Ticker(tkr).history(start=start_d, end=end_d, interval="1wk", actions=False)
+			try:
+				self.assertTrue(df1.index.equals(df2.index))
+			except AssertionError:
+				# Only a failed assertion is handled; other errors pass through
+				missing_from_df1 = df2.index.difference(df1.index)
+				missing_from_df2 = df1.index.difference(df2.index)
+				print("{}-with-events missing these dates: {}".format(tkr, missing_from_df1))
+				print("{}-without-events missing these dates: {}".format(tkr, missing_from_df2))
+				raise
+
+if __name__ == '__main__':
+	unittest.main()
+
+	# Run tests sequentially:
+	import inspect
+	test_src = inspect.getsource(TestPriceHistory)
+	unittest.TestLoader.sortTestMethodsUsing = lambda _, x, y: (
+		test_src.index(f"def {x}") - test_src.index(f"def {y}")
+	)
+	unittest.main(verbosity=2)

From a72458555260696b2593d4f3dec76b5cf1fdc586 Mon Sep 17 00:00:00 2001
From: ValueRaider <ValueRaider@protonmail.com>
Date: Mon, 10 Oct 2022 14:00:10 +0100
Subject: [PATCH 3/3] Tidy syntax

---
 tests/__init__.py | 2 +-
 tests/prices.py   | 8 --------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/__init__.py b/tests/__init__.py
index fef66b5a2..4265cc3e6 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +1 @@
-#!/usr/bin/env python
\ No newline at end of file
+#!/usr/bin/env python
diff --git a/tests/prices.py b/tests/prices.py
index 35a82eefd..239e62db9 100644
--- a/tests/prices.py
+++ b/tests/prices.py
@@ -42,11 +42,3 @@ def test_weeklyWithEvents(self):
 
 if __name__ == '__main__':
 	unittest.main()
-
-	# Run tests sequentially:
-	import inspect
-	test_src = inspect.getsource(TestPriceHistory)
-	unittest.TestLoader.sortTestMethodsUsing = lambda _, x, y: (
-		test_src.index(f"def {x}") - test_src.index(f"def {y}")
-	)
-	unittest.main(verbosity=2)