Issue 646/scraper weightwatchers (#657)

hhursev · Oct 29, 2022 · 954c5c6 · 954c5c6
1 parent 2e7d0e1
commit 954c5c6
Show file tree

Hide file tree

Showing 11 changed files with 904 additions and 1 deletion.
diff --git a/README.rst b/README.rst
@@ -293,6 +293,7 @@ Scrapers available for:
 - `https://vegolosi.it/ <https://vegolosi.it>`_
 - `https://vegrecipesofindia.com/ <https://www.vegrecipesofindia.com/>`_
 - `https://watchwhatueat.com/ <https://watchwhatueat.com/>`_
+- `https://www.weightwatchers.com/ <https://www.weightwatchers.com/>`_ (*)
 - `https://whatsgabycooking.com/ <https://whatsgabycooking.com>`_
 - `https://www.wholefoodsmarket.com/ <https://www.wholefoodsmarket.com/>`_
 - `https://www.wholefoodsmarket.co.uk/ <https://www.wholefoodsmarket.co.uk/>`_
@@ -304,6 +305,7 @@ Scrapers available for:
 - `https://zeit.de/ (wochenmarkt) <https://www.zeit.de/zeit-magazin/wochenmarkt/index>`_
 - `https://zenbelly.com/ <https://zenbelly.com>`_
 
+(*) Offline saved files only, because the page requires a login.
 
 Contribute
 ----------

diff --git a/recipe_scrapers/__init__.py b/recipe_scrapers/__init__.py
@@ -211,6 +211,8 @@
 from .vegolosi import Vegolosi
 from .vegrecipesofindia import VegRecipesOfIndia
 from .watchwhatueat import WatchWhatUEat
+from .weightwatchers import WeightWatchers
+from .weightwatcherspublic import WeightWatchersPublic
 from .whatsgabycooking import WhatsGabyCooking
 from .wholefoods import WholeFoods
 from .wikicookbook import WikiCookbook
@@ -441,6 +443,8 @@
     VegRecipesOfIndia.host(): VegRecipesOfIndia,
     Vegolosi.host(): Vegolosi,
     WatchWhatUEat.host(): WatchWhatUEat,
+    WeightWatchers.host(): WeightWatchers,
+    WeightWatchersPublic.host(): WeightWatchersPublic,
     WhatsGabyCooking.host(): WhatsGabyCooking,
     WholeFoods.host(): WholeFoods,
     WholeFoods.host(domain="co.uk"): WholeFoods,

diff --git a/recipe_scrapers/_utils.py b/recipe_scrapers/_utils.py
@@ -20,7 +20,7 @@
 }
 
 TIME_REGEX = re.compile(
-    r"(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc))?",
+    r"(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
     re.IGNORECASE,
 )
 

diff --git a/recipe_scrapers/weightwatchers.py b/recipe_scrapers/weightwatchers.py
@@ -0,0 +1,160 @@
+# mypy: allow-untyped-defs
+
+import re
+
+from ._abstract import AbstractScraper
+from ._utils import get_minutes, get_yields, normalize_string
+
+
+class WeightWatchers(AbstractScraper):
+    """Scraper for non-public WeightWatchers recipe pages.
+
+    Times, yield and difficulty sit in one common container ``div`` on both
+    public and non-public recipes, but the class of that container and of its
+    sub elements differs. Finding the container and extracting a single value
+    are therefore separate hooks (``_findDataContainer`` and
+    ``_extractItemField``) that the class for public recipes overrides, while
+    picking each value by its position is done here.
+    """
+
+    @classmethod
+    def host(cls):
+        return "www.weightwatchers.com"
+
+    def author(self):
+        return "WeightWatchers"
+
+    def title(self):
+        return self.soup.find("h1").get_text().strip()
+
+    def category(self):
+        return "WeightWatchers"
+
+    def _findDataContainer(self):
+        """Return the ``div`` that holds times, yield and difficulty."""
+        return self.soup.find("div", {"class": "styles_container__3N3E8"})
+
+    def _extractItemField(self, item):
+        """Return the value node of one entry of the data container."""
+        return item.contents[1]
+
+    # The entries of the data container are read by position: total time,
+    # prep time, cook time, yield, difficulty. Looking each entry up by its
+    # label text (e.g. "minutes Total Time") would be an alternative to
+    # relying on that order.
+    def total_time(self):
+        return get_minutes(
+            self._extractItemField(self._findDataContainer().contents[0])
+        )
+
+    def prep_time(self):
+        return get_minutes(
+            self._extractItemField(self._findDataContainer().contents[1])
+        )
+
+    def cook_time(self):
+        return get_minutes(
+            self._extractItemField(self._findDataContainer().contents[2])
+        )
+
+    def yields(self):
+        return get_yields(self._extractItemField(self._findDataContainer().contents[3]))
+
+    def difficulty(self):
+        return self._extractItemField(self._findDataContainer().contents[4]).get_text()
+
+    def image(self):
+        """Return the image URL found in the inline ``style``, or ``None``.
+
+        ``None`` is returned when the image ``div`` is missing, when it has no
+        ``style`` attribute, or when the style does not contain a quoted
+        ``url("...");`` entry.
+        """
+        image_div = self.soup.find("div", {"class": "styles_image__2dnNm"})
+        if image_div is None:
+            return None
+
+        background_style = image_div.get("style")
+        if not background_style:
+            return None
+
+        match = re.search(r'url\("(?P<imgurl>\S*)"\);', background_style)
+        return match.group("imgurl") if match else None
+
+    def _findIngridientTags(self):
+        """Return one tag per ingredient listed under the ingredients header."""
+        return self.soup.find(
+            "h3", {"id": "food-detail-recipe-ingredients-header"}
+        ).parent.find_all("div", {"class": "styles_name__1OYVU"})
+
+    def _extractIngridientName(self, ingridient):
+        """Return the normalized name of a single ingredient tag."""
+        return normalize_string(
+            ingridient.find("div", {"class": "styles_ingredientName__1Vffd"})
+            .find("div")
+            .get_text()
+        )
+
+    def _extractPortionParts(self, ingridient):
+        """Return ``(amount, unit, comment)`` for a single ingredient tag.
+
+        ``comment`` is ``None`` when the portion has no third ``span``.
+        """
+        tags = ingridient.find("div", {"class": "styles_portion__2NQyq"}).find_all(
+            "span"
+        )
+        amount = normalize_string(tags[0].get_text())
+        unit = normalize_string(tags[1].get_text())
+        # The third span is optional; test its presence instead of catching
+        # the IndexError raised by indexing past the end.
+        comment = (
+            normalize_string(tags[2].get_text().replace(", ", ""))
+            if len(tags) > 2
+            else None
+        )
+        return (amount, unit, comment)
+
+    def __parseIngridient(self, ingridient):
+        """Format one ingredient as ``"amount unit name[; comment]"``."""
+        ingridientName = self._extractIngridientName(ingridient)
+        amount, unit, comment = self._extractPortionParts(ingridient)
+
+        if comment:
+            return f"{amount} {unit} {ingridientName}; {comment}"
+        return f"{amount} {unit} {ingridientName}"
+
+    def ingredients(self):
+        return [
+            self.__parseIngridient(ingridient)
+            for ingridient in self._findIngridientTags()
+        ]
+
+    def _getInstructions(self, headertag, headerattribute, headervalue, instructiontag):
+        """Return the instruction steps joined by newlines.
+
+        The steps are the ``instructiontag`` elements inside the ``ol`` that
+        is found from the parent of the header identified by ``headertag``
+        and ``{headerattribute: headervalue}``.
+        """
+        instructions = self.soup.find(
+            headertag, {headerattribute: headervalue}
+        ).parent.find("ol")
+        return "\n".join(
+            normalize_string(instruction.get_text())
+            for instruction in instructions.find_all(instructiontag)
+        )
+
+    def instructions(self):
+        return self._getInstructions(
+            "h3", "id", "food-detail-recipe-instruction-header", "div"
+        )
+
+    def description(self):
+        return self.soup.find("div", {"class": "copy-1"}).get_text().strip()
+
+    def nutrients(self):
+        """Return the points shown on the page as a dict.
+
+        ``"personal points"`` is always read; ``"positive points"`` is added
+        only when the vegetable servings block is present.
+        """
+        result = {}
+
+        result["personal points"] = (
+            self.soup.find("div", {"class": "styles_points__2gv9n"})
+            .find("div", {"class": "styles_container__2p-YG"})
+            .get_text()
+        )
+
+        veggiepoints = self.soup.find(
+            "div", {"class": "styles_vegetableServings__2YSPy"}
+        )
+        if veggiepoints:
+            result["positive points"] = normalize_string(
+                veggiepoints.find(
+                    "div", {"class": "styles_container__2p-YG"}
+                ).next_sibling.get_text()
+            )
+
+        return result
diff --git a/recipe_scrapers/weightwatcherspublic.py b/recipe_scrapers/weightwatcherspublic.py
@@ -0,0 +1,66 @@
+# mypy: allow-untyped-defs
+
+from ._utils import normalize_string
+from .weightwatchers import WeightWatchers
+
+
+# collect the differences between public and non-public weightwatcher recipes in this class
+class WeightWatchersPublic(WeightWatchers):
+    """Overrides of ``WeightWatchers`` for the markup of public recipes."""
+
+    @classmethod
+    def host(cls):
+        return "www.weightwatchers.com"
+
+    def _findDataContainer(self):
+        container_attrs = {"class": "HorizontalList_list__GESs0"}
+        return self.soup.find("div", container_attrs)
+
+    def _extractItemField(self, item):
+        value_attrs = {"data-e2e-name": "attribute_item_value"}
+        return item.find("div", value_attrs)
+
+    def image(self):
+        hero_image = self.soup.find("img", {"class": "FoodMasthead_heroImage__BjVdZ"})
+        return hero_image["src"]
+
+    def nutrients(self):
+        coin_text = self.soup.find("div", {"class": "Coin_text__3UOb0"})
+        return {"points": coin_text["aria-label"]}
+
+    def description(self):
+        masthead = self.soup.find(
+            "div", {"data-e2e-name": "food_masthead_detail_description"}
+        )
+        collapsed = masthead.find("div", {"class": "ReadMoreLess_collapsed__IAzxP"})
+        return normalize_string(collapsed.get_text())
+
+    def instructions(self):
+        return self._getInstructions(
+            headertag="h2",
+            headerattribute="class",
+            headervalue="InstructionsFood_headline__vw7cn",
+            instructiontag="span",
+        )
+
+    def _findIngridientTags(self):
+        card = self.soup.find("div", {"class": "IngredientsCard_card__VSY4x"})
+        items = card.find("div", {"data-e2e-name": "vertical_list_items"})
+        # Only the direct children are ingredient entries.
+        return items.find_all("div", recursive=False)
+
+    def _extractIngridientName(self, ingridient):
+        name_tag = ingridient.find("p", {"data-e2e-name": "ingredient_name"})
+        return normalize_string(name_tag.get_text())
+
+    def _extractPortionParts(self, ingridient):
+        """Return ``(amount, unit, comment)`` for a single ingredient tag."""
+        description_tag = ingridient.find(
+            "p", {"data-e2e-name": "ingredient_description"}
+        )
+        spans = description_tag.find_all("span")
+
+        if len(spans) > 2:
+            # The comment has its own span, prefixed by one ", " separator.
+            comment = normalize_string(spans[2].get_text().replace(", ", "", 1))
+            unit = normalize_string(spans[1].get_text())
+        else:
+            # Unit and optional comment share the second span as "unit, comment".
+            unit, separator, remainder = normalize_string(
+                spans[1].get_text()
+            ).partition(", ")
+            comment = remainder if separator else None
+
+        amount = normalize_string(spans[0].get_text())
+        return (amount, unit, comment)
diff --git a/tests/test_data/weightwatchers.testhtml b/tests/test_data/weightwatchers.testhtml
diff --git a/tests/test_data/weightwatchers_2.testhtml b/tests/test_data/weightwatchers_2.testhtml
diff --git a/tests/test_data/weightwatcherspublic.testhtml b/tests/test_data/weightwatcherspublic.testhtml
diff --git a/tests/test_weightwatchers.py b/tests/test_weightwatchers.py
@@ -0,0 +1,83 @@
+# mypy: allow-untyped-defs
+
+from recipe_scrapers.weightwatchers import WeightWatchers
+from tests import ScraperTest
+
+
+class TestWeightwatchersScraper(ScraperTest):
+    """Checks the ``WeightWatchers`` scraper against the saved test page.
+
+    All assertions pass the expected value first and the scraped value second.
+    """
+
+    # Test-Url:
+    # https://cmx.weightwatchers.de/details/WWRECIPE:5667ab72a29713e4335bb342
+
+    scraper_class = WeightWatchers
+
+    def test_host(self):
+        self.assertEqual("www.weightwatchers.com", self.harvester_class.host())
+
+    def test_author(self):
+        self.assertEqual("WeightWatchers", self.harvester_class.author())
+
+    def test_title(self):
+        self.assertEqual("Würstchengulasch mit Nudeln", self.harvester_class.title())
+
+    def test_category(self):
+        self.assertEqual("WeightWatchers", self.harvester_class.category())
+
+    def test_total_time(self):
+        self.assertEqual(25, self.harvester_class.total_time())
+
+    def test_cook_time(self):
+        self.assertEqual(0, self.harvester_class.cook_time())
+
+    def test_prep_time(self):
+        self.assertEqual(25, self.harvester_class.prep_time())
+
+    def test_yields(self):
+        self.assertEqual("2 servings", self.harvester_class.yields())
+
+    def test_image(self):
+        self.assertEqual(
+            "https://cmx.weightwatchers.com/assets-proxy/weight-watchers/image/upload/t_WINE_EXTRALARGE/i34cskr1hxegmxqukawd.jpg",
+            self.harvester_class.image(),
+        )
+
+    def test_ingredients(self):
+        self.assertEqual(
+            [
+                "2 Stück Geflügelwürstchen",
+                "1 Stück, klein Zwiebel/n",
+                "200 g Champignons, frisch; braun",
+                "120 g Nudeln, trocken, jede Sorte; Spiralnudeln",
+                "1 Prise(n) Salz/Jodsalz",
+                "2 TL Pflanzenöl, Rapsöl/Sonnenblumenöl",
+                "400 g Tomaten, passiert",
+                "1 Prise(n) Pfeffer",
+                "1⁄2 TL Paprikapulver",
+            ],
+            self.harvester_class.ingredients(),
+        )
+
+    def test_ingredientsCount(self):
+        self.assertEqual(9, len(self.harvester_class.ingredients()))
+
+    def test_instructions(self):
+        self.assertEqual(
+            "Würstchen in Scheiben schneiden. Zwiebel schälen und würfeln. Champignons trocken abreiben und vierteln. Nudeln nach Packungsanweisung in Salzwasser garen.\nÖl in einem Topf erhitzen und Zwiebelwürfel darin andünsten. Würstchenscheiben und Champignonviertel zufügen und ca. 3 Minuten anbraten. Mit Tomaten ablöschen, aufkochen und ca. 5 Minuten köcheln lassen. Würstchengulasch mit Salz, Pfeffer und Paprikapulver würzen. Nudeln abgießen, untermischen und in einer Frischhaltedose transportieren. Würstchengulasch erwärmen und servieren.",
+            self.harvester_class.instructions(),
+        )
+
+    def test_description(self):
+        self.assertEqual(
+            "18 Uhr und alle haben Hunger? Dann koche rasch das Würstchengulasch und alle sind happy.",
+            self.harvester_class.description(),
+        )
+
+    def test_difficulty(self):
+        self.assertEqual("Leicht", self.harvester_class.difficulty())
+
+    def test_nutrients(self):
+        expected_nutrients = {
+            "personal points": "earn 12 personal points",
+            "positive points": "+2 Punkte von 2 Portion(en) Gemüse",
+        }
+        self.assertEqual(expected_nutrients, self.harvester_class.nutrients())