Skip to content

Commit

Permalink
System health. Scrapers needed polishing (#676)
Browse files Browse the repository at this point in the history
* Health Check. A LOT of scrapers needed polishing

- Adds creativecanning.com/ (same scraper as PracticalSelfReliance)
- Removes sallys-blog.de. Scraper did not work. Site is tricky to scrape.

The good news is that a lot of scrapers could be polished because they have schema data available!

* Post-merge fixes after merging the main branch (needed due to design changes on the sites)
  • Loading branch information
hhursev committed Oct 30, 2022
1 parent 1582886 commit fa61cf2
Show file tree
Hide file tree
Showing 252 changed files with 83,246 additions and 98,772 deletions.
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -122,6 +122,7 @@ Scrapers available for:
- `https://www.coop.se/ <https://www.coop.se/>`_
- `https://copykat.com/ <https://copykat.com>`_
- `https://countryliving.com/ <https://countryliving.com>`_
- `https://creativecanning.com/ <https://creativecanning.com>`_
- `https://cucchiaio.it/ <https://cucchiaio.it>`_
- `https://cuisineaz.com/ <https://cuisineaz.com>`_
- `https://cybercook.com.br/ <https://cybercook.com.br/>`_
Expand Down Expand Up @@ -253,7 +254,6 @@ Scrapers available for:
- `https://rezeptwelt.de/ <https://rezeptwelt.de>`_
- `https://rosannapansino.com <https://rosannapansino.com>`_
- `https://sallysbakingaddiction.com <https://sallysbakingaddiction.com/>`_
- `https://sallys-blog.de <https://sallys-blog.de/>`_
- `https://www.saveur.com/ <https://www.saveur.com/>`_
- `https://seriouseats.com/ <https://seriouseats.com>`_
- `https://simple-veganista.com/ <https://simple-veganista.com/>`_
Expand Down
3 changes: 1 addition & 2 deletions recipe_scrapers/__init__.py
Expand Up @@ -169,7 +169,6 @@
from .rezeptwelt import Rezeptwelt
from .rosannapansino import RosannaPansino
from .sallysbakingaddiction import SallysBakingAddiction
from .sallysblog import SallysBlog
from .saveur import Saveur
from .seriouseats import SeriousEats
from .simpleveganista import SimpleVeganista
Expand Down Expand Up @@ -386,6 +385,7 @@
PingoDoce.host(): PingoDoce,
PopSugar.host(): PopSugar,
PracticalSelfReliance.host(): PracticalSelfReliance,
PracticalSelfReliance.host(domain="creativecanning.com"): PracticalSelfReliance,
PrimalEdgeHealth.host(): PrimalEdgeHealth,
Przepisy.host(): Przepisy,
PurelyPope.host(): PurelyPope,
Expand All @@ -401,7 +401,6 @@
Rezeptwelt.host(): Rezeptwelt,
RosannaPansino.host(): RosannaPansino,
SallysBakingAddiction.host(): SallysBakingAddiction,
SallysBlog.host(): SallysBlog,
Saveur.host(): Saveur,
SeriousEats.host(): SeriousEats,
SimpleVeganista.host(): SimpleVeganista,
Expand Down
11 changes: 11 additions & 0 deletions recipe_scrapers/_schemaorg.py
Expand Up @@ -3,6 +3,8 @@
# find a package that parses https://schema.org/Recipe properly (or create one ourselves).


from itertools import chain

import extruct

from recipe_scrapers.settings import settings
Expand Down Expand Up @@ -160,6 +162,10 @@ def ingredients(self):
ingredients = (
self.data.get("recipeIngredient") or self.data.get("ingredients") or []
)

if ingredients and isinstance(ingredients[0], list):
ingredients = list(chain(*ingredients)) # flatten

return [
normalize_string(ingredient) for ingredient in ingredients if ingredient
]
Expand Down Expand Up @@ -206,6 +212,9 @@ def _extract_howto_instructions_text(self, schema_item):
def instructions(self):
instructions = self.data.get("recipeInstructions") or ""

if instructions and isinstance(instructions[0], list):
instructions = list(chain(*instructions)) # flatten

if isinstance(instructions, list):
instructions_gist = []
for schema_instruction_item in instructions:
Expand Down Expand Up @@ -244,4 +253,6 @@ def description(self):
description = self.data.get("description")
if description is None:
raise SchemaOrgException("No description data in SchemaOrg.")
if description and isinstance(description, list):
description = description[0]
return normalize_string(description)
6 changes: 5 additions & 1 deletion recipe_scrapers/_utils.py
Expand Up @@ -20,7 +20,7 @@
}

TIME_REGEX = re.compile(
r"(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
r"(\D*(?P<days>\d+)\s*(days|D))?(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
re.IGNORECASE,
)

Expand Down Expand Up @@ -77,7 +77,11 @@ def get_minutes(element, return_zero_on_not_found=False): # noqa: C901: TODO

minutes = int(matched.groupdict().get("minutes") or 0)
hours_matched = matched.groupdict().get("hours")
days_matched = matched.groupdict().get("days")

# workaround for formats like: 0D4H45M, that are not a valid iso8601 it seems
if days_matched:
minutes += 60 * 60 * float(days_matched.strip())
if hours_matched:
hours_matched = hours_matched.strip()
if any([symbol in FRACTIONS.keys() for symbol in hours_matched]):
Expand Down
24 changes: 6 additions & 18 deletions recipe_scrapers/comidinhasdochef.py
@@ -1,6 +1,5 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import normalize_string


class ComidinhasDoChef(AbstractScraper):
Expand All @@ -9,36 +8,25 @@ def host(cls):
return "comidinhasdochef.com"

def author(self):
return self.soup.find("span", {"class": "theauthor"}).get_text(strip=True)
return self.schema.author()

def title(self):
return self.soup.find("h1", {"class": "title"}).get_text()
return self.schema.title()

def total_time(self):
return self.schema.total_time()

def yields(self):
yields = self.soup.find("span", {"itemprop": "recipeYield"})
return yields.get_text() if yields else None
return self.schema.yields()

def image(self):
return self.schema.image()

def ingredients(self):
return [
normalize_string(ingredient.get_text())
for ingredient in self.soup.find_all("li", {"itemprop": "recipeIngredient"})
]
return self.schema.ingredients()

def instructions(self):
instructions = [
normalize_string(instruction.get_text(strip=True))
for instruction in self.soup.find_all(
"li", {"itemprop": "recipeInstructions"}
)
]
return "\n".join(instructions)
return self.schema.instructions()

def ratings(self):
rating = self.soup.find("span", {"itemprop": "ratingValue"}).get_text()
return round(float(rating), 2)
return self.schema.ratings()
28 changes: 5 additions & 23 deletions recipe_scrapers/countryliving.py
@@ -1,7 +1,6 @@
# mypy: allow-untyped-defs

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class CountryLiving(AbstractScraper):
Expand All @@ -10,33 +9,16 @@ def host(cls):
return "countryliving.com"

def title(self):
return self.soup.find("h1", {"class": "content-hed recipe-hed"}).get_text()

def author(self):
return self.soup.find("span", {"rel": "author"}).get_text()
return self.schema.title()

def total_time(self):
return get_minutes(
self.soup.find("span", {"class": "total-time-amount"}).parent
)
return self.schema.total_time()

def yields(self):
yields = self.soup.find(
"div", {"class": "recipe-details-item yields"}
).get_text()

return get_yields("{} servings".format(yields))
return self.schema.yields()

def ingredients(self):
ingredients = self.soup.findAll("div", {"class": "ingredient-item"})

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("div", {"class": "direction-lists"}).find_all(
"li"
)

return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()
22 changes: 1 addition & 21 deletions recipe_scrapers/cucchiaio.py
@@ -1,6 +1,5 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields


class Cucchiaio(AbstractScraper):
Expand All @@ -14,30 +13,11 @@ def author(self):
def title(self):
return self.schema.title()

def total_time(self):
block = self.soup.find("div", {"class": "scheda-ricetta-new"})
if block:
return sum(map(get_minutes, block.findAll("tr")))
return 0

def yields(self):
header = self.soup.find("td", string="PORZIONI")
if header:
value = header.find_next("td")
return get_yields(value)
return None

def image(self):
data = self.soup.find("div", {"class": "auto"}).find("img", {"class": "image"})
if data:
data = data.get("src")
return data
return self.schema.image()

def ingredients(self):
return self.schema.ingredients()

def instructions(self):
return self.schema.instructions()

def ratings(self):
return None
3 changes: 3 additions & 0 deletions recipe_scrapers/cuisineaz.py
Expand Up @@ -7,6 +7,9 @@ class CuisineAZ(AbstractScraper):
def host(cls):
return "cuisineaz.com"

def author(self):
return self.schema.author()

def title(self):
return self.schema.title()

Expand Down
39 changes: 6 additions & 33 deletions recipe_scrapers/delish.py
@@ -1,13 +1,5 @@
# mypy: disallow_untyped_defs=False
# delish.py
# Written by J. Kwon
# Freely released the code to recipe_scraper group
# March 1st, 2020
# ==========================================================


from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class Delish(AbstractScraper):
Expand All @@ -16,38 +8,19 @@ def host(cls):
return "delish.com"

def title(self):
return normalize_string(self.soup.find("h1").get_text())
return self.schema.title()

# Return total time to complete dish in minutes (includes prep time)
def total_time(self):
total_time_class = self.soup.find("span", {"class": "total-time-amount"})
return get_minutes(total_time_class)
return self.schema.total_time()

def yields(self):
yields_class = self.soup.find("span", {"class": "yields-amount"})

return get_yields(yields_class)
return self.schema.yields()

def image(self):
try:
# Case when image is at the top of the recipe content div
image = self.soup.find(
"div", {"class": "content-lede-image-wrap aspect-ratio-1x1"}
).find("img")
return image["data-src"] if image else None

except Exception:
# If the image is not at the top, it will be found at the
# bottom of the recipe content div
image = self.soup.find("picture")
return image.find("source")["data-srcset"] if image else None
return self.schema.image()

def ingredients(self):
ingredients = self.soup.findAll("div", {"class": "ingredient-item"})
return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("div", {"class": "direction-lists"}).findAll("li")
return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()
40 changes: 6 additions & 34 deletions recipe_scrapers/finedininglovers.py
@@ -1,7 +1,6 @@
# mypy: allow-untyped-defs

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class FineDiningLovers(AbstractScraper):
Expand All @@ -10,51 +9,24 @@ def host(cls):
return "finedininglovers.com"

def title(self):
return self.soup.find("h1", {"class": "recipe-full-class"}).get_text()
return self.schema.title()

def author(self):
container = self.soup.find("div", {"class": "author-name"})
if container:
return container.find("a").get_text()

def total_time(self):
return get_minutes(self.soup.find("div", {"class": "timing"}))
return self.schema.total_time()

def yields(self):
yields = self.soup.find(
"div", {"class": "field--name-field-recipe-serving-num"}
)

return get_yields("{} servings".format(yields))
return self.schema.yields()

def ingredients(self):
ingredients_parent = self.soup.find("div", {"class": "ingredients-box"})
ingredients = ingredients_parent.findAll(
"div", {"class": "paragraph--type--recipe-ingredient"}
)

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions_parent = self.soup.find(
"div", {"class": "field--name-field-recipe-para-steps"}
)

if instructions_parent is not None:
instructions = instructions_parent.findAll(
"div", {"class": "paragraph--type--recipe-step"}
)
else:
instructions_parent = self.soup.find("div", {"class": "ante-body"})
instructions = instructions_parent.findAll({"li", "p"})

return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()

def image(self):
image = self.soup.select_one(".image-zone picture img")
image_url = image["data-src"].split("?")[0]
image_base_url = "https://www.finedininglovers.com"

return "{}{}".format(image_base_url, image_url) if image else None
return self.schema.image()

0 comments on commit fa61cf2

Please sign in to comment.