From f131ffcf49f2257a7b70f8ddd6fe15663f82ff43 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Oct 2022 22:47:18 +0100 Subject: [PATCH 1/5] Refactor Woolworths scraper to remove use of settings.TEST_MODE --- recipe_scrapers/woolworths.py | 26 ++++++++++++++------------ tests/test_woolworths.py | 6 ++++++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/recipe_scrapers/woolworths.py b/recipe_scrapers/woolworths.py index c69f10f64..1544eeafb 100644 --- a/recipe_scrapers/woolworths.py +++ b/recipe_scrapers/woolworths.py @@ -1,29 +1,31 @@ # mypy: disallow_untyped_defs=False -import json +import requests -from recipe_scrapers.settings import settings - -from ._abstract import AbstractScraper +from ._abstract import AbstractScraper, HEADERS from ._schemaorg import SchemaOrg from ._utils import url_path_to_dict class Woolworths(AbstractScraper): - def __init__(self, url, *args, **kwargs): - if not settings.TEST_MODE: # pragma: no cover - target = url_path_to_dict(url)["path"].split("/")[-1] - url = f"https://foodhub.woolworths.com.au/content/woolworths-foodhub/en/{target}.model.json" - + def __init__(self, url, proxies=None, timeout=None, *args, **kwargs): super().__init__(url=url, *args, **kwargs) - self.page_data = ( - json.loads(self.page_data) + target = url_path_to_dict(url)["path"].split("/")[-1] + data_url = f"https://foodhub.woolworths.com.au/content/woolworths-foodhub/en/{target}.model.json" + + recipe_json = ( + requests.get( + data_url, + headers=HEADERS, + proxies=proxies, + timeout=timeout, + ).json() .get(":items") .get("root") .get(":items") .get("recipe_seo_data") ) - self.schema = SchemaOrg(self.page_data, raw=True) + self.schema = SchemaOrg(recipe_json, raw=True) @classmethod def host(cls): diff --git a/tests/test_woolworths.py b/tests/test_woolworths.py index 7ba02ad8a..c9f4a9643 100644 --- a/tests/test_woolworths.py +++ b/tests/test_woolworths.py @@ -1,3 +1,4 @@ +from responses import GET from recipe_scrapers.woolworths import Woolworths from tests import ScraperTest @@ -6,6 +7,11 @@ class TestWoolworthsScraper(ScraperTest): scraper_class = Woolworths + @property + def expected_requests(self): + yield GET, "https://www.woolworths.com.au/shop/recipes/asparagus-salad-with-lemon-vinaigrette", "tests/test_data/woolworths.testhtml" + yield GET, "https://foodhub.woolworths.com.au/content/woolworths-foodhub/en/asparagus-salad-with-lemon-vinaigrette.model.json", "tests/test_data/woolworths.testhtml" + def test_host(self): self.assertEqual("woolworths.com.au", self.harvester_class.host()) From 5e12274f9cdef991b63544046ced488bfe74eefb Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Oct 2022 23:01:16 +0100 Subject: [PATCH 2/5] Refactor GoustoJSON scraper to remove use of settings.TEST_MODE --- recipe_scrapers/goustojson.py | 26 ++++++++++++++++---------- tests/test_goustojson.py | 8 ++++++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/recipe_scrapers/goustojson.py b/recipe_scrapers/goustojson.py index 16dfe5db2..7c5ff2664 100644 --- a/recipe_scrapers/goustojson.py +++ b/recipe_scrapers/goustojson.py @@ -1,9 +1,7 @@ # mypy: disallow_untyped_defs=False -import json +import requests -from recipe_scrapers.settings import settings - -from ._abstract import AbstractScraper +from ._abstract import AbstractScraper, HEADERS from ._utils import get_minutes, get_yields, normalize_string, url_path_to_dict @@ -13,14 +11,22 @@ class GoustoJson(AbstractScraper): Let's see if it stands the test of time and reevaluate. """ - def __init__(self, url, *args, **kwargs): - if not settings.TEST_MODE: # pragma: no cover - recipe_slug = url_path_to_dict(url).get("path").split("/")[-1] - url = f"https://production-api.gousto.co.uk/cmsreadbroker/v1/recipe/{recipe_slug}" - + def __init__(self, url, proxies=None, timeout=None, *args, **kwargs): super().__init__(url=url, *args, **kwargs) - self.page_data = json.loads(self.page_data).get("data") + recipe_slug = url_path_to_dict(url).get("path").split("/")[-1] + data_url = ( + f"https://production-api.gousto.co.uk/cmsreadbroker/v1/recipe/{recipe_slug}" + ) + + recipe_json = requests.get( + data_url, + headers=HEADERS, + proxies=proxies, + timeout=timeout, + ).json() + + self.page_data = recipe_json.get("data") self.data = self.page_data.get("entry") @classmethod diff --git a/tests/test_goustojson.py b/tests/test_goustojson.py index 483a4acb6..2d0ce8fb0 100644 --- a/tests/test_goustojson.py +++ b/tests/test_goustojson.py @@ -1,3 +1,4 @@ +from responses import GET from recipe_scrapers.goustojson import GoustoJson from tests import ScraperTest @@ -5,8 +6,11 @@ class TestGoustoScraper(ScraperTest): scraper_class = GoustoJson - test_file_name = "gousto" - test_file_extension = "testjson" + + @property + def expected_requests(self): + yield GET, "https://www.gousto.co.uk/cookbook/recipes/malaysian-style-coconut-meat-free-chicken-pickled-cucumber", "tests/test_data/gousto.testjson" + yield GET, "https://production-api.gousto.co.uk/cmsreadbroker/v1/recipe/malaysian-style-coconut-meat-free-chicken-pickled-cucumber", "tests/test_data/gousto.testjson" def test_host(self): self.assertEqual("gousto.co.uk", self.harvester_class.host()) From b0a4204f6f94ef7443bbfa3a63cd2a90e37c2565 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Oct 2022 23:03:18 +0100 Subject: [PATCH 3/5] Cleanup: remove TEST_MODE setting Cleanup task related to #617 --- recipe_scrapers/settings/default.py | 3 --- tests/test_data/test_settings_module/test_settings.py | 1 - 2 files changed, 4 deletions(-) diff --git a/recipe_scrapers/settings/default.py b/recipe_scrapers/settings/default.py index b5351f4f5..1071b9608 100644 --- a/recipe_scrapers/settings/default.py +++ b/recipe_scrapers/settings/default.py @@ -40,9 +40,6 @@ } -TEST_MODE = False - - # logging.DEBUG # 10 # logging.INFO # 20 # logging.WARNING # 30 diff --git a/tests/test_data/test_settings_module/test_settings.py b/tests/test_data/test_settings_module/test_settings.py index 53d6ea671..7cdc32674 100644 --- a/tests/test_data/test_settings_module/test_settings.py +++ b/tests/test_data/test_settings_module/test_settings.py @@ -1,4 +1,3 @@ SUPPRESS_EXCEPTIONS = True -TEST_MODE = True META_HTTP_EQUIV = True # LOG_LEVEL = 20 From af7d713ee3aea3a87a114ed8849b6b4d0bc650c8 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Oct 2022 23:03:34 +0100 Subject: [PATCH 4/5] Fixup: linting for Woolworths --- recipe_scrapers/woolworths.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/recipe_scrapers/woolworths.py b/recipe_scrapers/woolworths.py index 1544eeafb..3bc5422e6 100644 --- a/recipe_scrapers/woolworths.py +++ b/recipe_scrapers/woolworths.py @@ -19,7 +19,8 @@ def __init__(self, url, proxies=None, timeout=None, *args, **kwargs): headers=HEADERS, proxies=proxies, timeout=timeout, - ).json() + ) + .json() .get(":items") .get("root") .get(":items") From 3cd4bdd972fd3241ab218f8ef191b1e70dcf3f1c Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Oct 2022 23:07:51 +0100 Subject: [PATCH 5/5] Consistency / diff-minimization: Woolworths: store retrieved JSON as page_data --- recipe_scrapers/woolworths.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe_scrapers/woolworths.py b/recipe_scrapers/woolworths.py index 3bc5422e6..b3d01fd28 100644 --- a/recipe_scrapers/woolworths.py +++ b/recipe_scrapers/woolworths.py @@ -13,7 +13,7 @@ def __init__(self, url, proxies=None, timeout=None, *args, **kwargs): target = url_path_to_dict(url)["path"].split("/")[-1] data_url = f"https://foodhub.woolworths.com.au/content/woolworths-foodhub/en/{target}.model.json" - recipe_json = ( + self.page_data = ( requests.get( data_url, headers=HEADERS, @@ -26,7 +26,7 @@ def __init__(self, url, proxies=None, timeout=None, *args, **kwargs): .get(":items") .get("recipe_seo_data") ) - self.schema = SchemaOrg(recipe_json, raw=True) + self.schema = SchemaOrg(self.page_data, raw=True) @classmethod def host(cls):