def test_repair_weekly_100x(self):
    """Weekly prices carrying injected 100x errors are restored by _fix_unit_mixups."""
    # Sometimes, Yahoo returns prices 100x the correct value.
    # Suspect mixup between £/pence or $/cents etc.
    # E.g. ticker PNL.L

    # Setup: a small, known-good weekly table for the ticker.
    tkr = "PNL.L"
    dat = yf.Ticker(tkr, session=self.session)
    tz_exchange = dat.info["exchangeTimezoneName"]

    data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
    week_starts = [_dt.date(2022, 10, 23),
                   _dt.date(2022, 10, 16),
                   _dt.date(2022, 10, 9),
                   _dt.date(2022, 10, 2)]
    prices = {"Open": [470.5, 473.5, 474.5, 470],
              "High": [476, 476.5, 477, 480],
              "Low": [470.5, 470, 465.5, 468.26],
              "Close": [475, 473.5, 472, 473.5],
              "Adj Close": [475, 473.5, 472, 473.5],
              "Volume": [2295613, 2245604, 3000287, 2635611]}
    df = _pd.DataFrame(data=prices, index=_pd.to_datetime(week_starts))
    df.index.name = "Date"

    # Corrupt one field in three different rows, before the index becomes tz-aware.
    df_bad = df.copy()
    for day, column in (("2022-10-23", "Close"),
                        ("2022-10-16", "Low"),
                        ("2022-10-2", "Open")):
        df_bad.loc[day, column] *= 100
    df.index = df.index.tz_localize(tz_exchange)
    df_bad.index = df_bad.index.tz_localize(tz_exchange)

    # Run test
    df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange)

    # First test - no errors left
    for column in data_cols:
        matches = _np.isclose(df_repaired[column], df[column], rtol=1e-2)
        self.assertTrue(matches.all())

    # Second test - all differences should be either ~1x or ~100x
    ratio = (df_bad[data_cols].values / df[data_cols].values).round(2)
    # - snap near-100 ratios onto the nearest multiple of 10:
    near_100 = ratio > 90
    ratio[near_100] = (ratio[near_100] / 10).round().astype(int) * 10
    # - now test
    self.assertTrue(((ratio == 100) | (ratio == 1)).all())
"last"), - Low=("Low", "min"), - High=("High", "max"), - Volume=("Volume", "sum"), - Dividends=("Dividends", "sum"), - StockSplits=("Stock Splits", "prod")).rename( - columns={"StockSplits": "Stock Splits", "AdjClose": "Adj Close"}) - df_yf_weekly.loc[df_yf_weekly["Stock Splits"] == 1, "Stock Splits"] = 0 - if df_yf_weekly.index[0] not in df_daily.index: - # Exchange closed Monday. In this case, Yahoo sets Open to last week close - df_daily_last_week = dat.history(start=dt - _dt.timedelta(days=7), end=dt, interval="1d", - auto_adjust=False, repair=True) - df_yf_weekly["Open"] = df_daily_last_week["Close"][-1] - df_yf_weekly["Low"] = _np.minimum(df_yf_weekly["Low"], df_yf_weekly["Open"]) - - # Compare fetched-weekly vs constructed-weekly: - df_yf_weekly = df_yf_weekly[df.columns] - try: - # Note: Adj Close has tiny variance depending on date range requested - data_cols = ["Open", "Close", "Low", "High"] - self.assertTrue(_np.equal(df.loc[dt, data_cols].values, df_yf_weekly[data_cols].iloc[0].values).all()) - self.assertLess(abs(df.loc[dt, "Adj Close"] / df_yf_weekly["Adj Close"].iloc[0] - 1.0), 0.000001) - except: - for c in df.columns: - if c == "Adj Close": - fail = abs(df.loc[dt, c] / df_yf_weekly[c].iloc[0] - 1.0) < 0.000001 - else: - fail = df.loc[dt, c] != df_yf_weekly[c].iloc[0] - if fail: - print("dt = ", dt) - print("df.loc[dt]:", type(df.loc[dt])) - print(df.loc[dt].to_dict()) - print("df_yf_weekly.iloc[0]:", type(df_yf_weekly.iloc[0])) - print(df_yf_weekly.iloc[0].to_dict()) - print("Result:", df.loc[dt, c]) - print("Answer:", df_yf_weekly[c].iloc[0]) - raise Exception("Mismatch in column '{}'".format(c)) - - def test_repair_weekly2_preSplit(self): + def test_repair_weekly_preSplit_100x(self): # Sometimes, Yahoo returns prices 100x the correct value. # Suspect mixup between £/pence or $/cents etc. # E.g. ticker PNL.L # PNL.L has a stock-split in 2022. Sometimes requesting data before 2022 is not split-adjusted. 
def test_repair_weekly_preSplit_100x(self):
    """Weekly prices lacking split-adjustment, with injected 100x errors, are repaired."""
    # Sometimes, Yahoo returns prices 100x the correct value.
    # Suspect mixup between £/pence or $/cents etc.
    # E.g. ticker PNL.L

    # PNL.L has a stock-split in 2022. Sometimes requesting data before 2022 is not split-adjusted.

    tkr = "PNL.L"
    dat = yf.Ticker(tkr, session=self.session)
    tz_exchange = dat.info["exchangeTimezoneName"]

    data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
    df = _pd.DataFrame(data={"Open": [400, 398, 392.5, 417],
                             "High": [421, 425, 419, 420.5],
                             "Low": [400, 380.5, 376.5, 396],
                             "Close": [410, 409.5, 402, 399],
                             "Adj Close": [398.02, 397.53, 390.25, 387.34],
                             "Volume": [3232600, 3773900, 10835000, 4257900]},
                       index=_pd.to_datetime([_dt.date(2020, 3, 30),
                                              _dt.date(2020, 3, 23),
                                              _dt.date(2020, 3, 16),
                                              _dt.date(2020, 3, 9)]))
    # Simulate data missing split-adjustment:
    df[data_cols] *= 100.0
    df["Volume"] *= 0.01
    #
    df.index.name = "Date"
    # Create 100x errors:
    df_bad = df.copy()
    df_bad.loc["2020-03-30", "Close"] *= 100
    df_bad.loc["2020-03-23", "Low"] *= 100
    df_bad.loc["2020-03-09", "Open"] *= 100
    df.index = df.index.tz_localize(tz_exchange)
    df_bad.index = df_bad.index.tz_localize(tz_exchange)

    df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange)

    # First test - no errors left
    for c in data_cols:
        try:
            self.assertTrue(_np.isclose(df_repaired[c], df[c], rtol=1e-2).all())
        except AssertionError:
            # Only a failed assertion warrants the diagnostic dump; a bare
            # `except:` would also intercept KeyboardInterrupt/SystemExit.
            print("Mismatch in column", c)
            print("- df_repaired:")
            print(df_repaired[c])
            print("- answer:")
            print(df[c])
            raise

    # Second test - all differences should be either ~1x or ~100x
    ratio = df_bad[data_cols].values / df[data_cols].values
    ratio = ratio.round(2)
    # - round near-100 ratio to 100:
    f = ratio > 90
    ratio[f] = (ratio[f] / 10).round().astype(int) * 10  # round ratio to nearest 10
    # - now test
    f_100 = ratio == 100
    f_1 = ratio == 1
    self.assertTrue((f_100 | f_1).all())
Exchange closed Monday. In this case, Yahoo sets Open to last week close - df_daily_last_week = dat.history(start=dt - _dt.timedelta(days=7), end=dt, interval="1d", - auto_adjust=False, repair=True) - df_yf_weekly["Open"] = df_daily_last_week["Close"][-1] - df_yf_weekly["Low"] = _np.minimum(df_yf_weekly["Low"], df_yf_weekly["Open"]) - - # Compare fetched-weekly vs constructed-weekly: - df_yf_weekly = df_yf_weekly[df.columns] - try: - # Note: Adj Close has tiny variance depending on date range requested - data_cols = ["Open", "Close", "Low", "High"] - self.assertTrue(_np.equal(df.loc[dt, data_cols].values, df_yf_weekly[data_cols].iloc[0].values).all()) - self.assertLess(abs(df.loc[dt, "Adj Close"] / df_yf_weekly["Adj Close"].iloc[0] - 1.0), 0.000001) - except: - for c in df.columns: - if c == "Adj Close": - fail = abs(df.loc[dt, c] / df_yf_weekly[c].iloc[0] - 1.0) < 0.000001 - else: - fail = df.loc[dt, c] != df_yf_weekly[c].iloc[0] - if fail: - print("dt = ", dt) - print("df.loc[dt]:", type(df.loc[dt])) - print(df.loc[dt].to_dict()) - print("df_yf_weekly.iloc[0]:", type(df_yf_weekly.iloc[0])) - print(df_yf_weekly.iloc[0].to_dict()) - print("Result:", df.loc[dt, c]) - print("Answer:", df_yf_weekly[c].iloc[0]) - raise Exception("Mismatch in column '{}'".format(c)) - - def test_repair_daily(self): + def test_repair_daily_100x(self): # Sometimes, Yahoo returns prices 100x the correct value. # Suspect mixup between £/pence or $/cents etc. # E.g. 
ticker PNL.L tkr = "PNL.L" - start = "2020-01-01" - end = min(_dt.date.today(), _dt.date(2023, 1, 1)) dat = yf.Ticker(tkr, session=self.session) + tz_exchange = dat.info["exchangeTimezoneName"] data_cols = ["Low", "High", "Open", "Close", "Adj Close"] - df_bad = dat.history(start=start, end=end, interval="1d", auto_adjust=False, repair=False) - f_outlier = _np.where(df_bad[data_cols] > 1000.0) - indices = None - if len(f_outlier[0]) == 0: - self.skipTest("Skipping test_repair_daily() because no price 100x errors to repair") - - # Outliers detected - indices = [] - for i in range(len(f_outlier[0])): - indices.append((f_outlier[0][i], f_outlier[1][i])) - - df = dat.history(start=start, end=end, interval="1d", auto_adjust=False, repair=True) + df = _pd.DataFrame(data={"Open": [478, 476, 476, 472], + "High": [478, 477.5, 477, 475], + "Low": [474.02, 474, 473, 470.75], + "Close": [475.5, 475.5, 474.5, 475], + "Adj Close": [475.5, 475.5, 474.5, 475], + "Volume": [436414, 485947, 358067, 287620]}, + index=_pd.to_datetime([_dt.date(2022, 11, 1), + _dt.date(2022, 10, 31), + _dt.date(2022, 10, 28), + _dt.date(2022, 10, 27)])) + df.index.name = "Date" + df_bad = df.copy() + df_bad.loc["2022-11-01", "Close"] *= 100 + df_bad.loc["2022-10-31", "Low"] *= 100 + df_bad.loc["2022-10-27", "Open"] *= 100 + df.index = df.index.tz_localize(tz_exchange) + df_bad.index = df_bad.index.tz_localize(tz_exchange) + + df_repaired = dat._fix_unit_mixups(df_bad, "1d", tz_exchange) # First test - no errors left - df_data = df[data_cols].values - for i, j in indices: - try: - self.assertTrue(df_data[i, j] < 1000.0) - except: - print("Detected uncorrected error: idx={}, {}={}".format(df.index[i], data_cols[j], df_data[i, j])) - # print(df.iloc[i-1:i+2]) - raise + for c in data_cols: + self.assertTrue(_np.isclose(df_repaired[c], df[c], rtol=1e-2).all()) # Second test - all differences should be either ~1x or ~100x ratio = df_bad[data_cols].values / df[data_cols].values @@ -501,6 +404,35 @@ def 
def test_repair_daily_zeroes(self):
    """Daily rows whose Open/High/Low arrive as 0.0 are filled in by _fix_zero_prices."""
    # Sometimes Yahoo returns price=0.0 when price obviously not zero
    # E.g. ticker BBIL.L

    tkr = "BBIL.L"
    dat = yf.Ticker(tkr, session=self.session)
    tz_exchange = dat.info["exchangeTimezoneName"]

    days = [_dt.datetime(2022, 11, 1),
            _dt.datetime(2022, 10, 31),
            _dt.datetime(2022, 10, 30)]
    raw = {"Open": [0, 102.04, 102.04],
           "High": [0, 102.1, 102.11],
           "Low": [0, 102.04, 102.04],
           "Close": [103.03, 102.05, 102.08],
           "Adj Close": [102.03, 102.05, 102.08],
           "Volume": [560, 137, 117]}
    df_bad = _pd.DataFrame(data=raw, index=_pd.to_datetime(days))
    df_bad.index.name = "Date"
    df_bad.index = df_bad.index.tz_localize(tz_exchange)

    repaired_df = dat._fix_zero_prices(df_bad, "1d", tz_exchange)

    # Expected table: identical to the input except for the zeroed first row.
    correct_df = df_bad.copy()
    first_day = correct_df.index[0]
    for column, expected in (("Open", 102.080002),
                             ("Low", 102.032501),
                             ("High", 102.080002)):
        correct_df.loc[first_day, column] = expected

    for column in ["Open", "Low", "High", "Close"]:
        matches = _np.isclose(repaired_df[column], correct_df[column], rtol=1e-8)
        self.assertTrue(matches.all())
def _reconstruct_interval(self, df_row, interval, bad_fields):
    """Rebuild the corrupt price fields of one interval from finer-grained data.

    A weekly row ("1wk") is rebuilt from daily prices and a daily row ("1d")
    from hourly prices, both fetched through ``self.history``.

    Args:
        df_row: One row of a price table as a pandas Series; its ``name`` is
            the timestamp at which the interval starts.
        interval: Interval of ``df_row``; only "1wk" and "1d" are handled.
        bad_fields: list/set/ndarray naming the columns of ``df_row`` that
            are known to be wrong.

    Returns:
        A dict mapping column name to replacement value, or ``None`` when the
        row cannot be rebuilt (unsupported interval, daily row too old for
        hourly data, or no fine-grained rows for the interval).

    Raises:
        TypeError: If ``df_row`` is not a Series or ``bad_fields`` is not a
            list/set/ndarray.
        Exception: If none of Open/High/Low/Close is trustworthy, so the
            split-adjustment of the fine-grained data cannot be checked.
    """
    if not isinstance(df_row, _pd.Series):
        raise TypeError("'df_row' must be a Pandas Series not {}".format(type(df_row)))
    if not isinstance(bad_fields, (list, set, _np.ndarray)):
        raise TypeError("'bad_fields' must be a list/set not {}".format(type(bad_fields)))
    bad_fields = set(bad_fields)

    data_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df_row.index]

    # If interval is weekly then can construct with daily. But if smaller intervals then
    # restricted to recent times:
    # - daily = hourly restricted to last 730 days
    if interval == "1wk":
        # Correct by fetching week of daily data
        sub_interval = "1d"
        td_range = _datetime.timedelta(days=7)
    elif interval == "1d":
        # Correct by fetching day of hourly data
        sub_interval = "1h"
        td_range = _datetime.timedelta(days=1)
    else:
        print("WARNING: Have not implemented repair for '{}' interval. Contact developers".format(interval))
        # Callers treat None as "nothing reconstructed"; anything else is
        # consumed as a dict of replacement values.
        return None

    idx = df_row.name
    start = idx.date()
    if sub_interval == "1h" and (_datetime.date.today() - start) > _datetime.timedelta(days=729):
        # Don't bother requesting more price data, Yahoo will reject
        return None

    if sub_interval == "1h":
        df_fine = self.history(start=start, end=start + td_range, interval=sub_interval, auto_adjust=False)
    else:
        # Also fetch the preceding week: its last close is needed when the
        # exchange was closed on the first day of this week.
        df_fine = self.history(start=start - td_range, end=start + td_range, interval=sub_interval, auto_adjust=False)

    df_fine_from_idx = df_fine[df_fine.index >= idx]
    if df_fine_from_idx.empty:
        # No fine-grained prices inside the interval, nothing to rebuild from.
        return None

    # First, check whether df_fine has different split-adjustment than df_row.
    # If it is different, then adjust df_fine to match df_row.
    # 'Adj Close' is left out: the ratio below is only computed from Open/High/Low/Close.
    good_fields = [c for c in data_cols if c not in bad_fields and c != "Adj Close"]
    if len(good_fields) == 0:
        raise Exception("No good fields, so cannot determine whether different split-adjustment. Contact developers")
    ratios = []
    for f in good_fields:
        if f == "Low":
            ratios.append(df_row[f] / df_fine_from_idx[f].min())
        elif f == "High":
            ratios.append(df_row[f] / df_fine_from_idx[f].max())
        elif f == "Open":
            ratios.append(df_row[f] / df_fine_from_idx[f].iloc[0])
        elif f == "Close":
            ratios.append(df_row[f] / df_fine_from_idx[f].iloc[-1])
    ratio = _np.mean(ratios)
    ratio_rcp = round(1.0 / ratio, 1)
    ratio = round(ratio, 1)
    if ratio > 1:
        # data has different split-adjustment than fine-grained data
        # Adjust fine-grained to match
        df_fine[data_cols] *= ratio
    elif ratio_rcp > 1:
        # data has different split-adjustment than fine-grained data
        # Adjust fine-grained to match
        df_fine[data_cols] *= 1.0 / ratio_rcp

    df_last_week = None
    if sub_interval != "1h":
        # Separate the previous week from the interval being rebuilt.
        df_last_week = df_fine[df_fine.index < idx]
        df_fine = df_fine[df_fine.index >= idx]

    new_vals = {}
    if "High" in bad_fields:
        new_vals["High"] = df_fine["High"].max()
    if "Low" in bad_fields:
        new_vals["Low"] = df_fine["Low"].min()
    if "Open" in bad_fields:
        closed_first_day = (sub_interval != "1h" and idx != df_fine.index[0]
                            and not df_last_week.empty)
        if closed_first_day:
            # Exchange closed Monday. In this case, Yahoo sets Open to last week close
            new_vals["Open"] = df_last_week["Close"].iloc[-1]
            if "Low" in new_vals:
                new_vals["Low"] = min(new_vals["Open"], new_vals["Low"])
            elif new_vals["Open"] < df_row["Low"]:
                new_vals["Low"] = new_vals["Open"]
        else:
            new_vals["Open"] = df_fine["Open"].iloc[0]
    if "Close" in bad_fields:
        new_vals["Close"] = df_fine["Close"].iloc[-1]
        # Assume 'Adj Close' also corrupted, easier than detecting whether true
        new_vals["Adj Close"] = df_fine["Adj Close"].iloc[-1]

    return new_vals
from scipy import ndimage as _ndimage - data_cols = ["Open", "High", "Low", "Close"] - data_cols = [c for c in data_cols if c in df.columns] - median = _ndimage.median_filter(df[data_cols].values, size=(3, 3), mode='mirror') + data_cols = ["High", "Open", "Low", "Close"] # Order important, separate High from Low + data_cols = [c for c in data_cols if c in df2.columns] + median = _ndimage.median_filter(df2[data_cols].values, size=(3, 3), mode="wrap") if (median == 0).any(): raise Exception("median contains zeroes, why?") - ratio = df[data_cols].values / median - # ratio_rounded = (ratio/5).round()*5 # round ratio to nearest 5 - ratio_rounded = (ratio / 10).round() * 10 # round ratio to nearest 10 + ratio = df2[data_cols].values / median + ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20 f = ratio_rounded == 100 # Store each mixup: @@ -410,103 +511,22 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): if fj.any(): dc = data_cols[j] for i in _np.where(fj)[0]: - idx = df.index[i] + idx = df2.index[i] if idx not in mixups: - mixups[idx] = {"data": df.loc[idx, data_cols], "fields": {dc}} + mixups[idx] = {"data": df2.loc[idx, data_cols], "fields":{dc}} else: mixups[idx]["fields"].add(dc) n_mixups = len(mixups) if len(mixups) > 0: - # Problem with Yahoo's mixup is they calculate high & low after, so they can be corrupted. - # If interval is weekly then can correct with daily. But if smaller intervals then - # restricted to recent times: - # - daily = hourly restricted to last 730 days - sub_interval = None - td_range = None - if interval == "1wk": - # Correct by fetching week of daily data - sub_interval = "1d" - td_range = _datetime.timedelta(days=7) - elif interval == "1d": - # Correct by fetching day of hourly data - sub_interval = "1h" - td_range = _datetime.timedelta(days=1) - else: - print("WARNING: Have not implemented repair for '{}' interval. 
Contact developers".format(interval)) - return df - - # This first pass will correct all errors in Open/Close/Adj Close columns. - # It will also *attempt* to correct Low/High columns, but only if can get price data. + # This first pass will correct all errors in Open/Close/AdjClose columns. + # It will also attempt to correct Low/High columns, but only if can get price data. for idx in sorted(list(mixups.keys())): m = mixups[idx] - # Although only some fields in row exhibit 100x error, normally the other fields are also corrupted, - # so need to recalculate all fields in row. - - if td_range is None: - raise Exception("was hoping this wouldn't happen") - - start = idx.date() - if sub_interval == "1h" and (_datetime.date.today() - start) > _datetime.timedelta(days=729): - # Don't bother requesting more price data, Yahoo will reject - pass - else: - if sub_interval == "1h": - df_fine = self.history(start=idx.date(), end=idx.date() + td_range, interval=sub_interval, - auto_adjust=False) - else: - df_fine = self.history(start=idx.date() - td_range, end=idx.date() + td_range, - interval=sub_interval, auto_adjust=False) - - # First, check whether df_fine has different split-adjustment than df. - # If it is different, then adjust df_fine to match df - good_fields = list(set(data_cols) - m["fields"]) - median = df.loc[idx, good_fields].median() - median_fine = _np.median(df_fine[good_fields].values) - ratio = round(median / median_fine, 1) - ratio_rcp = round(median_fine / median, 1) - if ratio == 1 and ratio_rcp == 1: - # Good! 
- pass - else: - if ratio > 1: - # data has different split-adjustment than fine-grained data - # Adjust fine-grained to match - df_fine[data_cols] *= ratio - elif ratio_rcp > 1: - # data has different split-adjustment than fine-grained data - # Adjust fine-grained to match - df_fine[data_cols] *= 1.0 / ratio_rcp - median_fine = _np.median(df_fine[good_fields].values) - ratio = round(median / median_fine, 1) - ratio_rcp = round(median_fine / median, 1) - - if sub_interval != "1h": - # dt_before_week = df_fine.index[df_fine.index.get_loc(idx)-1] - df_last_week = df_fine[df_fine.index < idx] - df_fine = df_fine[df_fine.index >= idx] - - if "High" in m["fields"]: - df.loc[idx, "High"] = df_fine["High"].max() - m["fields"].remove("High") - if "Low" in m["fields"]: - df.loc[idx, "Low"] = df_fine["Low"].min() - m["fields"].remove("Low") - if "Open" in m["fields"]: - if sub_interval != "1h" and idx != df_fine.index[0]: - # Exchange closed Monday. In this case, Yahoo sets Open to last week close - df.loc[idx, "Open"] = df_last_week["Close"][-1] - df.loc[idx, "Low"] = min(df.loc[idx, "Open"], df.loc[idx, "Low"]) - else: - df.loc[idx, "Open"] = df_fine["Open"].iloc[0] - m["fields"].remove("Open") - if "Close" in m["fields"]: - df.loc[idx, "Close"] = df_fine["Close"].iloc[-1] - m["fields"].remove("Close") - # Assume 'Adj Close' also corrupted, easier than detecting whether true - df.loc[idx, "Adj Close"] = df_fine["Adj Close"].iloc[-1] - - if len(m["fields"]) == 0: + new_values = self._reconstruct_interval(df2.loc[idx], interval, m["fields"]) + if not new_values is None: + for k in new_values: + df2.loc[idx, k] = new_values[k] del mixups[idx] # This second pass will *crudely* "fix" any remaining errors in High/Low @@ -514,12 +534,12 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): if len(mixups) > 0: for idx in sorted(list(mixups.keys())): m = mixups[idx] - row = df.loc[idx, ["Open", "Close"]] + row = df2.loc[idx, ["Open", "Close"]] if "High" in m["fields"]: - 
def _fix_zero_prices(self, df, interval, tz_exchange):
    """Replace Open/High/Low/Close values of 0.0 with reconstructed prices.

    Sometimes Yahoo returns prices=0 when obviously wrong e.g. Volume>0 and
    Close>0. Every row holding a zero price is handed to
    ``self._reconstruct_interval`` and the values it returns are written
    into a copy of ``df``.

    Args:
        df: Price table indexed by timestamp.
        interval: Interval of the rows, passed through to the reconstruction.
        tz_exchange: Timezone the copy's index is localized or converted to.

    Returns:
        ``df`` itself when it has fewer than two rows, otherwise a repaired
        copy with its index in ``tz_exchange``.
    """
    if df.shape[0] < 2:
        # Need multiple rows to confidently identify outliers
        return df

    df2 = df.copy()

    if df2.index.tz is None:
        df2.index = df2.index.tz_localize(tz_exchange)
    else:
        df2.index = df2.index.tz_convert(tz_exchange)

    data_cols = [c for c in ["Open", "High", "Low", "Close"] if c in df2.columns]
    f_zeroes = (df2[data_cols] == 0.0).values.any(axis=1)

    n_fixed = 0
    for i in _np.where(f_zeroes)[0]:
        idx = df2.index[i]
        df_row = df2.loc[idx]
        # Only price columns can be "bad" here; a zero in another column
        # (e.g. Volume) is not a price error.
        bad_fields = [c for c in data_cols if df_row[c] == 0.0]
        new_values = self._reconstruct_interval(df_row, interval, bad_fields)
        if not isinstance(new_values, dict):
            # The reconstruction yields None (or hands the row back untouched)
            # when it cannot rebuild the interval; leave this row as it is.
            continue
        for k, v in new_values.items():
            df2.loc[idx, k] = v
        n_fixed += 1

    if n_fixed > 0:
        print("{}: fixed {} price=0.0 errors in {} price data".format(self.ticker, n_fixed, interval))
    return df2