Skip to content

Commit

Permalink
System health. Scrapers needed polishing (#676)
Browse files Browse the repository at this point in the history
* Health Check. A LOT of scrapers needed polishing

- Adds creativecanning.com/ (same scraper as PracticalSelfReliance)
- Removes sallys-blog.de. Scraper did not work. Site is tricky to scrape.

The good news is that a lot of scrapers could be polished because they have schema data available!

* Post-merge fixes after merging the main branch (needed due to design changes on the sites)
  • Loading branch information
hhursev committed Oct 30, 2022
1 parent 1582886 commit fa61cf2
Show file tree
Hide file tree
Showing 252 changed files with 83,246 additions and 98,772 deletions.
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -122,6 +122,7 @@ Scrapers available for:
- `https://www.coop.se/ <https://www.coop.se/>`_
- `https://copykat.com/ <https://copykat.com>`_
- `https://countryliving.com/ <https://countryliving.com>`_
- `https://creativecanning.com/ <https://creativecanning.com>`_
- `https://cucchiaio.it/ <https://cucchiaio.it>`_
- `https://cuisineaz.com/ <https://cuisineaz.com>`_
- `https://cybercook.com.br/ <https://cybercook.com.br/>`_
Expand Down Expand Up @@ -253,7 +254,6 @@ Scrapers available for:
- `https://rezeptwelt.de/ <https://rezeptwelt.de>`_
- `https://rosannapansino.com <https://rosannapansino.com>`_
- `https://sallysbakingaddiction.com <https://sallysbakingaddiction.com/>`_
- `https://sallys-blog.de <https://sallys-blog.de/>`_
- `https://www.saveur.com/ <https://www.saveur.com/>`_
- `https://seriouseats.com/ <https://seriouseats.com>`_
- `https://simple-veganista.com/ <https://simple-veganista.com/>`_
Expand Down
3 changes: 1 addition & 2 deletions recipe_scrapers/__init__.py
Expand Up @@ -169,7 +169,6 @@
from .rezeptwelt import Rezeptwelt
from .rosannapansino import RosannaPansino
from .sallysbakingaddiction import SallysBakingAddiction
from .sallysblog import SallysBlog
from .saveur import Saveur
from .seriouseats import SeriousEats
from .simpleveganista import SimpleVeganista
Expand Down Expand Up @@ -386,6 +385,7 @@
PingoDoce.host(): PingoDoce,
PopSugar.host(): PopSugar,
PracticalSelfReliance.host(): PracticalSelfReliance,
PracticalSelfReliance.host(domain="creativecanning.com"): PracticalSelfReliance,
PrimalEdgeHealth.host(): PrimalEdgeHealth,
Przepisy.host(): Przepisy,
PurelyPope.host(): PurelyPope,
Expand All @@ -401,7 +401,6 @@
Rezeptwelt.host(): Rezeptwelt,
RosannaPansino.host(): RosannaPansino,
SallysBakingAddiction.host(): SallysBakingAddiction,
SallysBlog.host(): SallysBlog,
Saveur.host(): Saveur,
SeriousEats.host(): SeriousEats,
SimpleVeganista.host(): SimpleVeganista,
Expand Down
11 changes: 11 additions & 0 deletions recipe_scrapers/_schemaorg.py
Expand Up @@ -3,6 +3,8 @@
# find a package that parses https://schema.org/Recipe properly (or create one ourselves).


from itertools import chain

import extruct

from recipe_scrapers.settings import settings
Expand Down Expand Up @@ -160,6 +162,10 @@ def ingredients(self):
ingredients = (
self.data.get("recipeIngredient") or self.data.get("ingredients") or []
)

if ingredients and isinstance(ingredients[0], list):
ingredients = list(chain(*ingredients)) # flatten

return [
normalize_string(ingredient) for ingredient in ingredients if ingredient
]
Expand Down Expand Up @@ -206,6 +212,9 @@ def _extract_howto_instructions_text(self, schema_item):
def instructions(self):
instructions = self.data.get("recipeInstructions") or ""

if instructions and isinstance(instructions[0], list):
instructions = list(chain(*instructions)) # flatten

if isinstance(instructions, list):
instructions_gist = []
for schema_instruction_item in instructions:
Expand Down Expand Up @@ -244,4 +253,6 @@ def description(self):
description = self.data.get("description")
if description is None:
raise SchemaOrgException("No description data in SchemaOrg.")
if description and isinstance(description, list):
description = description[0]
return normalize_string(description)
6 changes: 5 additions & 1 deletion recipe_scrapers/_utils.py
Expand Up @@ -20,7 +20,7 @@
}

TIME_REGEX = re.compile(
r"(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
r"(\D*(?P<days>\d+)\s*(days|D))?(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
re.IGNORECASE,
)

Expand Down Expand Up @@ -77,7 +77,11 @@ def get_minutes(element, return_zero_on_not_found=False): # noqa: C901: TODO

minutes = int(matched.groupdict().get("minutes") or 0)
hours_matched = matched.groupdict().get("hours")
days_matched = matched.groupdict().get("days")

# workaround for formats like: 0D4H45M, that are not a valid iso8601 it seems
if days_matched:
minutes += 60 * 60 * float(days_matched.strip())
if hours_matched:
hours_matched = hours_matched.strip()
if any([symbol in FRACTIONS.keys() for symbol in hours_matched]):
Expand Down
24 changes: 6 additions & 18 deletions recipe_scrapers/comidinhasdochef.py
@@ -1,6 +1,5 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import normalize_string


class ComidinhasDoChef(AbstractScraper):
Expand All @@ -9,36 +8,25 @@ def host(cls):
return "comidinhasdochef.com"

def author(self):
return self.soup.find("span", {"class": "theauthor"}).get_text(strip=True)
return self.schema.author()

def title(self):
return self.soup.find("h1", {"class": "title"}).get_text()
return self.schema.title()

def total_time(self):
return self.schema.total_time()

def yields(self):
yields = self.soup.find("span", {"itemprop": "recipeYield"})
return yields.get_text() if yields else None
return self.schema.yields()

def image(self):
return self.schema.image()

def ingredients(self):
return [
normalize_string(ingredient.get_text())
for ingredient in self.soup.find_all("li", {"itemprop": "recipeIngredient"})
]
return self.schema.ingredients()

def instructions(self):
instructions = [
normalize_string(instruction.get_text(strip=True))
for instruction in self.soup.find_all(
"li", {"itemprop": "recipeInstructions"}
)
]
return "\n".join(instructions)
return self.schema.instructions()

def ratings(self):
rating = self.soup.find("span", {"itemprop": "ratingValue"}).get_text()
return round(float(rating), 2)
return self.schema.ratings()
28 changes: 5 additions & 23 deletions recipe_scrapers/countryliving.py
@@ -1,7 +1,6 @@
# mypy: allow-untyped-defs

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class CountryLiving(AbstractScraper):
Expand All @@ -10,33 +9,16 @@ def host(cls):
return "countryliving.com"

def title(self):
return self.soup.find("h1", {"class": "content-hed recipe-hed"}).get_text()

def author(self):
return self.soup.find("span", {"rel": "author"}).get_text()
return self.schema.title()

def total_time(self):
return get_minutes(
self.soup.find("span", {"class": "total-time-amount"}).parent
)
return self.schema.total_time()

def yields(self):
yields = self.soup.find(
"div", {"class": "recipe-details-item yields"}
).get_text()

return get_yields("{} servings".format(yields))
return self.schema.yields()

def ingredients(self):
ingredients = self.soup.findAll("div", {"class": "ingredient-item"})

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("div", {"class": "direction-lists"}).find_all(
"li"
)

return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()
22 changes: 1 addition & 21 deletions recipe_scrapers/cucchiaio.py
@@ -1,6 +1,5 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields


class Cucchiaio(AbstractScraper):
Expand All @@ -14,30 +13,11 @@ def author(self):
def title(self):
return self.schema.title()

def total_time(self):
block = self.soup.find("div", {"class": "scheda-ricetta-new"})
if block:
return sum(map(get_minutes, block.findAll("tr")))
return 0

def yields(self):
header = self.soup.find("td", string="PORZIONI")
if header:
value = header.find_next("td")
return get_yields(value)
return None

def image(self):
data = self.soup.find("div", {"class": "auto"}).find("img", {"class": "image"})
if data:
data = data.get("src")
return data
return self.schema.image()

def ingredients(self):
return self.schema.ingredients()

def instructions(self):
return self.schema.instructions()

def ratings(self):
return None
3 changes: 3 additions & 0 deletions recipe_scrapers/cuisineaz.py
Expand Up @@ -7,6 +7,9 @@ class CuisineAZ(AbstractScraper):
def host(cls):
return "cuisineaz.com"

def author(self):
return self.schema.author()

def title(self):
return self.schema.title()

Expand Down
39 changes: 6 additions & 33 deletions recipe_scrapers/delish.py
@@ -1,13 +1,5 @@
# mypy: disallow_untyped_defs=False
# delish.py
# Written by J. Kwon
# Freely released the code to recipe_scraper group
# March 1st, 2020
# ==========================================================


from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class Delish(AbstractScraper):
Expand All @@ -16,38 +8,19 @@ def host(cls):
return "delish.com"

def title(self):
return normalize_string(self.soup.find("h1").get_text())
return self.schema.title()

# Return total time to complete dish in minutes (includes prep time)
def total_time(self):
total_time_class = self.soup.find("span", {"class": "total-time-amount"})
return get_minutes(total_time_class)
return self.schema.total_time()

def yields(self):
yields_class = self.soup.find("span", {"class": "yields-amount"})

return get_yields(yields_class)
return self.schema.yields()

def image(self):
try:
# Case when image is at the top of the recipe content div
image = self.soup.find(
"div", {"class": "content-lede-image-wrap aspect-ratio-1x1"}
).find("img")
return image["data-src"] if image else None

except Exception:
# If the image is not at the top, it will be found at the
# bottom of the recipe content div
image = self.soup.find("picture")
return image.find("source")["data-srcset"] if image else None
return self.schema.image()

def ingredients(self):
ingredients = self.soup.findAll("div", {"class": "ingredient-item"})
return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("div", {"class": "direction-lists"}).findAll("li")
return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()
40 changes: 6 additions & 34 deletions recipe_scrapers/finedininglovers.py
@@ -1,7 +1,6 @@
# mypy: allow-untyped-defs

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class FineDiningLovers(AbstractScraper):
Expand All @@ -10,51 +9,24 @@ def host(cls):
return "finedininglovers.com"

def title(self):
return self.soup.find("h1", {"class": "recipe-full-class"}).get_text()
return self.schema.title()

def author(self):
container = self.soup.find("div", {"class": "author-name"})
if container:
return container.find("a").get_text()

def total_time(self):
return get_minutes(self.soup.find("div", {"class": "timing"}))
return self.schema.total_time()

def yields(self):
yields = self.soup.find(
"div", {"class": "field--name-field-recipe-serving-num"}
)

return get_yields("{} servings".format(yields))
return self.schema.yields()

def ingredients(self):
ingredients_parent = self.soup.find("div", {"class": "ingredients-box"})
ingredients = ingredients_parent.findAll(
"div", {"class": "paragraph--type--recipe-ingredient"}
)

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions_parent = self.soup.find(
"div", {"class": "field--name-field-recipe-para-steps"}
)

if instructions_parent is not None:
instructions = instructions_parent.findAll(
"div", {"class": "paragraph--type--recipe-step"}
)
else:
instructions_parent = self.soup.find("div", {"class": "ante-body"})
instructions = instructions_parent.findAll({"li", "p"})

return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()

def image(self):
image = self.soup.select_one(".image-zone picture img")
image_url = image["data-src"].split("?")[0]
image_base_url = "https://www.finedininglovers.com"

return "{}{}".format(image_base_url, image_url) if image else None
return self.schema.image()

0 comments on commit fa61cf2

Please sign in to comment.