From 7de905de13f94ab07aa2b655181d0d7c90b19577 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 13 Jul 2021 11:12:28 +0100 Subject: [PATCH] MAINT: Final fixes Fix all broken Yahoo readers Test against pandas 1 Change minimums --- pandas_datareader/_utils.py | 5 +- pandas_datareader/base.py | 8 +-- pandas_datareader/compat/__init__.py | 15 ++-- pandas_datareader/naver.py | 2 +- pandas_datareader/tests/test_econdb.py | 2 +- pandas_datareader/tests/test_famafrench.py | 66 +++++++++-------- pandas_datareader/tests/test_fred.py | 4 +- pandas_datareader/tests/yahoo/test_options.py | 2 +- pandas_datareader/tests/yahoo/test_yahoo.py | 70 +++++++++---------- pandas_datareader/yahoo/_headers.py | 13 ++++ pandas_datareader/yahoo/actions.py | 2 +- pandas_datareader/yahoo/daily.py | 29 ++------ pandas_datareader/yahoo/quotes.py | 23 ++++++ setup.cfg | 13 ++++ 14 files changed, 149 insertions(+), 105 deletions(-) create mode 100644 pandas_datareader/yahoo/_headers.py diff --git a/pandas_datareader/_utils.py b/pandas_datareader/_utils.py index 28870f0b..245cb33a 100644 --- a/pandas_datareader/_utils.py +++ b/pandas_datareader/_utils.py @@ -53,8 +53,11 @@ def _sanitize_dates(start, end): return start, end -def _init_session(session, retry_count=3): +def _init_session(session): if session is None: session = requests.Session() # do not set requests max_retries here to support arbitrary pause + else: + if not isinstance(session, requests.Session): + raise TypeError("session must be a request.Session") return session diff --git a/pandas_datareader/base.py b/pandas_datareader/base.py index f589c73b..f60d2665 100644 --- a/pandas_datareader/base.py +++ b/pandas_datareader/base.py @@ -70,7 +70,7 @@ def __init__( self.pause = pause self.timeout = timeout self.pause_multiplier = 1 - self.session = _init_session(session, retry_count) + self.session = _init_session(session) self.freq = freq self.headers = None @@ -148,11 +148,7 @@ def _get_response(self, url, params=None, headers=None): params : dict or None parameters passed to the URL """ - - # Use default headers if not passes and not using a user session - if headers is None: - headers = self.headers - + headers = headers or self.headers pause = self.pause last_response_text = "" for _ in range(self.retry_count + 1): diff --git a/pandas_datareader/compat/__init__.py b/pandas_datareader/compat/__init__.py index 26d502a5..5eba99d8 100644 --- a/pandas_datareader/compat/__init__.py +++ b/pandas_datareader/compat/__init__.py @@ -39,11 +39,16 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): # for "get_filepath_or_buffer" starting in pandas >= 0.20.0 if isinstance(filepath_or_buffer, dict): return filepath_or_buffer, encoding, compression - - tmp = com._get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, compression=None - ) - return tmp.filepath_or_buffer, tmp.encoding, tmp.compression + try: + tmp = com._get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=None + ) + return tmp.filepath_or_buffer, tmp.encoding, tmp.compression + except AttributeError: + tmp = com.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=None + ) + return tmp string_types = (str,) diff --git a/pandas_datareader/naver.py b/pandas_datareader/naver.py index 0c45ae0c..7187456b 100644 --- a/pandas_datareader/naver.py +++ b/pandas_datareader/naver.py @@ -35,7 +35,7 @@ def __init__( if not isinstance(symbols, string_types): raise NotImplementedError("Bulk-fetching is not implemented") - super(NaverDailyReader, self).__init__( + super().__init__( symbols=symbols, start=start, end=end, diff --git a/pandas_datareader/tests/test_econdb.py b/pandas_datareader/tests/test_econdb.py index f8891444..dadec47a 100644 --- a/pandas_datareader/tests/test_econdb.py +++ b/pandas_datareader/tests/test_econdb.py @@ -58,7 +58,7 @@ def test_get_tourism(self): start=pd.Timestamp("2008-01-01"), end=pd.Timestamp("2012-01-01"), ) - df = df.astype(np.float) + df = df.astype(float) jp = np.array([8351000, 6790000, 8611000, 6219000, 8368000], dtype=float) us = np.array( [175702304, 160507424, 164079728, 167600272, 171320416], dtype=float diff --git a/pandas_datareader/tests/test_famafrench.py b/pandas_datareader/tests/test_famafrench.py index 201a5367..4259a454 100644 --- a/pandas_datareader/tests/test_famafrench.py +++ b/pandas_datareader/tests/test_famafrench.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd from pandas import testing as tm import pytest @@ -48,49 +49,49 @@ def test_f_f_research(self): { "Mkt-RF": [ -3.36, - 3.4, + 3.40, 6.31, - 2.0, + 2.00, -7.89, - -5.56, + -5.57, 6.93, -4.77, 9.54, 3.88, - 0.6, + 0.60, 6.82, ], "SMB": [ - 0.38, - 1.2, - 1.42, - 4.98, - 0.05, - -1.97, - 0.16, - -3.00, - 3.92, - 1.15, - 3.70, - 0.7, + 0.37, + 1.19, + 1.44, + 4.86, + 0.14, + -1.84, + 0.18, + -3.02, + 3.93, + 1.07, + 3.78, + 0.68, ], "HML": [ - 0.31, - 3.16, - 2.1, - 2.81, - -2.38, - -4.5, - -0.27, - -1.95, - -3.12, - -2.59, - -0.9, - 3.81, + 0.33, + 3.19, + 2.11, + 2.91, + -2.39, + -4.52, + -0.36, + -1.90, + -3.23, + -2.46, + -0.95, + 3.64, ], "RF": [ - 0.0, - 0.0, + 0.00, + 0.00, 0.01, 0.01, 0.01, @@ -106,7 +107,10 @@ def test_f_f_research(self): index=pd.period_range("2010-01-01", "2010-12-01", freq="M", name="Date"), columns=["Mkt-RF", "SMB", "HML", "RF"], ) - tm.assert_frame_equal(results[0], exp, check_less_precise=0) + received = results[0] + np.testing.assert_allclose(received, exp) + tm.assert_index_equal(received.index, exp.index) + tm.assert_index_equal(received.columns, exp.columns) def test_me_breakpoints(self): results = web.DataReader( diff --git a/pandas_datareader/tests/test_fred.py b/pandas_datareader/tests/test_fred.py index 3029ee18..7776afc7 100644 --- a/pandas_datareader/tests/test_fred.py +++ b/pandas_datareader/tests/test_fred.py @@ -72,7 +72,9 @@ def test_fred_multi(self): # pragma: no cover ) expected.index.rename("DATE", inplace=True) expected.index.freq = "MS" - tm.assert_frame_equal(received, expected, check_less_precise=True) + np.testing.assert_allclose(received, expected) + tm.assert_index_equal(received.index, expected.index) + tm.assert_index_equal(received.columns, expected.columns) def test_fred_multi_bad_series(self): names = ["NOTAREALSERIES", "CPIAUCSL", "ALSO FAKE"] diff --git a/pandas_datareader/tests/yahoo/test_options.py b/pandas_datareader/tests/yahoo/test_options.py index 3c4c726d..d37839d8 100644 --- a/pandas_datareader/tests/yahoo/test_options.py +++ b/pandas_datareader/tests/yahoo/test_options.py @@ -9,7 +9,7 @@ from pandas_datareader import data as web -@pytest.yield_fixture +@pytest.fixture def aapl(): aapl = web.Options("aapl", "yahoo") yield aapl diff --git a/pandas_datareader/tests/yahoo/test_yahoo.py b/pandas_datareader/tests/yahoo/test_yahoo.py index 80ad951f..4fa7e524 100644 --- a/pandas_datareader/tests/yahoo/test_yahoo.py +++ b/pandas_datareader/tests/yahoo/test_yahoo.py @@ -159,7 +159,7 @@ def test_get_data_null_as_missing_data(self, adj_pr): else: floats.append("Adj Close") - assert result[floats].dtypes.all() == np.floating + assert result[floats].dtypes.all() == np.float64 @skip_on_exception(RemoteDataError) def test_get_data_multiple_symbols_two_dates(self): @@ -168,7 +168,7 @@ def test_get_data_multiple_symbols_two_dates(self): assert result.size == 3 # sanity checking - assert result.dtypes == np.floating + assert result.dtypes == np.float64 expected = np.array( [ @@ -207,12 +207,12 @@ def test_get_data_yahoo_actions(self): assert actions.loc["2005-02-28", "value"][0] == 1 / 2.0 assert actions.loc["1995-11-21", "action"][0] == "DIVIDEND" - assert round(actions.loc["1995-11-21", "value"][0], 3) == 0.120 + assert round(actions.loc["1995-11-21", "value"][0], 3) == 0.030 actions = web.get_data_yahoo_actions("AAPL", start, end, adjust_dividends=True) assert actions.loc["1995-11-21", "action"][0] == "DIVIDEND" - assert round(actions.loc["1995-11-21", "value"][0], 4) == 0.0043 + assert round(actions.loc["1995-11-21", "value"][0], 4) == 0.0011 def test_get_data_yahoo_actions_invalid_symbol(self): start = datetime(1990, 1, 1) @@ -226,14 +226,14 @@ def test_yahoo_reader_class(self): r = YahooDailyReader("GOOG", start="JAN-01-2015") df = r.read() - assert df.Volume.loc["JAN-02-2015"] == 1447500 + assert df.Volume.loc["JAN-02-2015"] == 1447563 session = requests.Session() r = YahooDailyReader("GOOG", session=session) assert r.session is session - def test_yahoo_DataReader(self): + def test_yahoo_datareader(self): start = datetime(2010, 1, 1) end = datetime(2015, 5, 9) # yahoo will adjust for dividends by default @@ -275,19 +275,19 @@ def test_yahoo_DataReader(self): "DIVIDEND", ], "value": [ - 0.52, - 0.47, - 0.47, - 0.47, - 0.14285714, - 0.47, - 0.43571, - 0.43571, - 0.43571, - 0.43571, - 0.37857, - 0.37857, - 0.37857, + 0.130000, + 0.117500, + 0.117500, + 0.117500, + 0.142857, + 0.117500, + 0.108929, + 0.108929, + 0.108929, + 0.108929, + 0.094643, + 0.094643, + 0.094643, ], }, index=exp_idx, @@ -316,19 +316,19 @@ def test_yahoo_DataReader(self): "DIVIDEND", ], "value": [ - 0.52, - 0.47, - 0.47, - 0.47, - 0.14285714, - 3.29, - 3.05, - 3.05, - 3.05, - 3.05, - 2.65, - 2.65, - 2.65, + 0.1300, + 0.1175, + 0.1175, + 0.1175, + 0.1429, + 0.8225, + 0.7625, + 0.7625, + 0.7625, + 0.7625, + 0.6625, + 0.6625, + 0.6625, ], }, index=exp_idx, @@ -344,13 +344,13 @@ def test_yahoo_DataReader(self): result = web.DataReader("NTR", "yahoo-actions", start, end) exp_idx = pd.DatetimeIndex( - ["2018-12-28", "2018-09-27", "2018-06-28", "2018-03-28", "2018-01-02"] + ["2018-12-28", "2018-09-27", "2018-06-28", "2018-03-28"] ) exp = pd.DataFrame( { - "action": ["DIVIDEND", "DIVIDEND", "DIVIDEND", "DIVIDEND", "SPLIT"], - "value": [0.43, 0.40, 0.40, 0.40, 1.00], + "action": ["DIVIDEND", "DIVIDEND", "DIVIDEND", "DIVIDEND"], + "value": [0.43, 0.40, 0.40, 0.40], }, index=exp_idx, ) diff --git a/pandas_datareader/yahoo/_headers.py b/pandas_datareader/yahoo/_headers.py new file mode 100644 index 00000000..dbc8932b --- /dev/null +++ b/pandas_datareader/yahoo/_headers.py @@ -0,0 +1,13 @@ +""" +Default header +""" +DEFAULT_HEADERS = { + "Connection": "keep-alive", + "Expires": str(-1), + "Upgrade-Insecure-Requests": str(1), + # Google Chrome: + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ), +} diff --git a/pandas_datareader/yahoo/actions.py b/pandas_datareader/yahoo/actions.py index 48a0f549..f4a81ee8 100644 --- a/pandas_datareader/yahoo/actions.py +++ b/pandas_datareader/yahoo/actions.py @@ -12,7 +12,7 @@ class YahooActionReader(YahooDailyReader): """ def read(self): - data = super(YahooActionReader, self).read() + data = super().read() actions = {} if isinstance(data.columns, MultiIndex): data = data.swaplevel(0, 1, axis=1) diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py index 0ade5e59..eda47402 100644 --- a/pandas_datareader/yahoo/daily.py +++ b/pandas_datareader/yahoo/daily.py @@ -8,6 +8,7 @@ from pandas_datareader._utils import RemoteDataError from pandas_datareader.base import _DailyBaseReader +from pandas_datareader.yahoo._headers import DEFAULT_HEADERS class YahooDailyReader(_DailyBaseReader): @@ -35,9 +36,8 @@ class YahooDailyReader(_DailyBaseReader): single value given for symbol, represents the pause between retries. session : Session, default None requests.sessions.Session instance to be used. Passing a session - is an advanced usage and you must either set the required - headers in the session directly or explicitly override - using the ``headers`` argument. + is an advanced usage and you must set any required + headers in the session directly. adjust_price : bool, default False If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops @@ -53,9 +53,6 @@ class YahooDailyReader(_DailyBaseReader): If True, adds Dividend and Split columns to dataframe. adjust_dividends: bool, default true If True, adjusts dividends for splits. - headers : dict, optional - Headers to use when reading data. If None (the default), a - standard set of headers is used. """ def __init__( @@ -72,9 +69,8 @@ def __init__( interval="d", get_actions=False, adjust_dividends=True, - headers=None, ): - super(YahooDailyReader, self).__init__( + super().__init__( symbols=symbols, start=start, end=end, @@ -87,21 +83,10 @@ def __init__( # Ladder up the wait time between subsequent requests to improve # probability of a successful retry self.pause_multiplier = 2.5 - if headers is not None: - self.headers = headers - elif session is None: - self.headers = { - "Connection": "keep-alive", - "Expires": str(-1), - "Upgrade-Insecure-Requests": str(1), - # Google Chrome: - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ), - } + if session is None: + self.headers = DEFAULT_HEADERS else: - self.headers = None + self.headers = session.headers self.adjust_price = adjust_price self.ret_index = ret_index diff --git a/pandas_datareader/yahoo/quotes.py b/pandas_datareader/yahoo/quotes.py index 4dddff3b..fba12548 100644 --- a/pandas_datareader/yahoo/quotes.py +++ b/pandas_datareader/yahoo/quotes.py @@ -5,6 +5,7 @@ from pandas_datareader.base import _BaseReader from pandas_datareader.compat import string_types +from pandas_datareader.yahoo._headers import DEFAULT_HEADERS _DEFAULT_PARAMS = { "lang": "en-US", @@ -17,6 +18,28 @@ class YahooQuotesReader(_BaseReader): """Get current yahoo quote""" + def __init__( + self, + symbols=None, + start=None, + end=None, + retry_count=3, + pause=0.1, + session=None, + ): + super().__init__( + symbols=symbols, + start=start, + end=end, + retry_count=retry_count, + pause=pause, + session=session, + ) + if session is not None: + self.headers = session.headers + else: + self.headers = DEFAULT_HEADERS + @property def url(self): return "https://query1.finance.yahoo.com/v7/finance/quote" diff --git a/setup.cfg b/setup.cfg index 704fd846..6118ea57 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,19 @@ markers = requires_api_key: mark a test as requiring an API key alpha_vantage: mark a test of the AlphaVantage reader quandl: mark a test of the Quandl reader +filterwarnings = + ignore:`np.bool` is a deprecated alias:DeprecationWarning:pandas.core.indexes + ignore:`np.object` is a deprecated alias:DeprecationWarning:pandas.core.indexes + ignore:`np.float` is a deprecated alias:DeprecationWarning:pandas.core.indexes + ignore:`np.complex` is a deprecated alias:DeprecationWarning:pandas.core.indexes + ignore:`np.bool` is a deprecated alias:DeprecationWarning:pandas.core.internals.blocks + ignore:`np.object` is a deprecated alias:DeprecationWarning:pandas.core.internals.blocks + ignore:`np.object` is a deprecated alias:DeprecationWarning:pandas.core.internals.construction + ignore:`np.object` is a deprecated alias:DeprecationWarning:pandas.io.parsers + ignore:`np.object` is a deprecated alias:DeprecationWarning:pandas.core.dtypes.cast + ignore:`np.float` is a deprecated alias:DeprecationWarning:pandas.core.internals.blocks + ignore:`np.complex` is a deprecated alias:DeprecationWarning:pandas.core.internals.blocks + ignore:Converting `np.inexact` or `np.floating` to a dtype:DeprecationWarning:pandas.core.indexes [flake8] ignore = E203, E266, E501, W503