Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

System health. Scrapers needed polishing #676

Merged
merged 4 commits into from Oct 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -122,6 +122,7 @@ Scrapers available for:
- `https://www.coop.se/ <https://www.coop.se/>`_
- `https://copykat.com/ <https://copykat.com>`_
- `https://countryliving.com/ <https://countryliving.com>`_
- `https://creativecanning.com/ <https://creativecanning.com>`_
- `https://cucchiaio.it/ <https://cucchiaio.it>`_
- `https://cuisineaz.com/ <https://cuisineaz.com>`_
- `https://cybercook.com.br/ <https://cybercook.com.br/>`_
Expand Down Expand Up @@ -253,7 +254,6 @@ Scrapers available for:
- `https://rezeptwelt.de/ <https://rezeptwelt.de>`_
- `https://rosannapansino.com <https://rosannapansino.com>`_
- `https://sallysbakingaddiction.com <https://sallysbakingaddiction.com/>`_
- `https://sallys-blog.de <https://sallys-blog.de/>`_
- `https://www.saveur.com/ <https://www.saveur.com/>`_
- `https://seriouseats.com/ <https://seriouseats.com>`_
- `https://simple-veganista.com/ <https://simple-veganista.com/>`_
Expand Down
3 changes: 1 addition & 2 deletions recipe_scrapers/__init__.py
Expand Up @@ -169,7 +169,6 @@
from .rezeptwelt import Rezeptwelt
from .rosannapansino import RosannaPansino
from .sallysbakingaddiction import SallysBakingAddiction
from .sallysblog import SallysBlog
from .saveur import Saveur
from .seriouseats import SeriousEats
from .simpleveganista import SimpleVeganista
Expand Down Expand Up @@ -386,6 +385,7 @@
PingoDoce.host(): PingoDoce,
PopSugar.host(): PopSugar,
PracticalSelfReliance.host(): PracticalSelfReliance,
PracticalSelfReliance.host(domain="creativecanning.com"): PracticalSelfReliance,
PrimalEdgeHealth.host(): PrimalEdgeHealth,
Przepisy.host(): Przepisy,
PurelyPope.host(): PurelyPope,
Expand All @@ -401,7 +401,6 @@
Rezeptwelt.host(): Rezeptwelt,
RosannaPansino.host(): RosannaPansino,
SallysBakingAddiction.host(): SallysBakingAddiction,
SallysBlog.host(): SallysBlog,
Saveur.host(): Saveur,
SeriousEats.host(): SeriousEats,
SimpleVeganista.host(): SimpleVeganista,
Expand Down
11 changes: 11 additions & 0 deletions recipe_scrapers/_schemaorg.py
Expand Up @@ -3,6 +3,8 @@
# find a package that parses https://schema.org/Recipe properly (or create one ourselves).


from itertools import chain

import extruct

from recipe_scrapers.settings import settings
Expand Down Expand Up @@ -160,6 +162,10 @@ def ingredients(self):
ingredients = (
self.data.get("recipeIngredient") or self.data.get("ingredients") or []
)

if ingredients and isinstance(ingredients[0], list):
ingredients = list(chain(*ingredients)) # flatten

return [
normalize_string(ingredient) for ingredient in ingredients if ingredient
]
Expand Down Expand Up @@ -206,6 +212,9 @@ def _extract_howto_instructions_text(self, schema_item):
def instructions(self):
instructions = self.data.get("recipeInstructions") or ""

if instructions and isinstance(instructions[0], list):
instructions = list(chain(*instructions)) # flatten

if isinstance(instructions, list):
instructions_gist = []
for schema_instruction_item in instructions:
Expand Down Expand Up @@ -244,4 +253,6 @@ def description(self):
description = self.data.get("description")
if description is None:
raise SchemaOrgException("No description data in SchemaOrg.")
if description and isinstance(description, list):
description = description[0]
return normalize_string(description)
6 changes: 5 additions & 1 deletion recipe_scrapers/_utils.py
Expand Up @@ -20,7 +20,7 @@
}

TIME_REGEX = re.compile(
r"(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
r"(\D*(?P<days>\d+)\s*(days|D))?(\D*(?P<hours>[\d.\s/?¼½¾⅓⅔⅕⅖⅗]+)\s*(hours|hrs|hr|h|óra|:))?(\D*(?P<minutes>\d+)\s*(minutes|mins|min|m|perc|$))?",
re.IGNORECASE,
)

Expand Down Expand Up @@ -77,7 +77,11 @@ def get_minutes(element, return_zero_on_not_found=False): # noqa: C901: TODO

minutes = int(matched.groupdict().get("minutes") or 0)
hours_matched = matched.groupdict().get("hours")
days_matched = matched.groupdict().get("days")

# workaround for formats like 0D4H45M, which do not appear to be valid ISO 8601 durations
if days_matched:
minutes += 60 * 24 * float(days_matched.strip())
if hours_matched:
hours_matched = hours_matched.strip()
if any([symbol in FRACTIONS.keys() for symbol in hours_matched]):
Expand Down
24 changes: 6 additions & 18 deletions recipe_scrapers/comidinhasdochef.py
@@ -1,6 +1,5 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import normalize_string


class ComidinhasDoChef(AbstractScraper):
Expand All @@ -9,36 +8,25 @@ def host(cls):
return "comidinhasdochef.com"

def author(self):
return self.soup.find("span", {"class": "theauthor"}).get_text(strip=True)
return self.schema.author()

def title(self):
return self.soup.find("h1", {"class": "title"}).get_text()
return self.schema.title()

def total_time(self):
return self.schema.total_time()

def yields(self):
yields = self.soup.find("span", {"itemprop": "recipeYield"})
return yields.get_text() if yields else None
return self.schema.yields()

def image(self):
return self.schema.image()

def ingredients(self):
return [
normalize_string(ingredient.get_text())
for ingredient in self.soup.find_all("li", {"itemprop": "recipeIngredient"})
]
return self.schema.ingredients()

def instructions(self):
instructions = [
normalize_string(instruction.get_text(strip=True))
for instruction in self.soup.find_all(
"li", {"itemprop": "recipeInstructions"}
)
]
return "\n".join(instructions)
return self.schema.instructions()

def ratings(self):
rating = self.soup.find("span", {"itemprop": "ratingValue"}).get_text()
return round(float(rating), 2)
return self.schema.ratings()
28 changes: 5 additions & 23 deletions recipe_scrapers/countryliving.py
@@ -1,7 +1,6 @@
# mypy: allow-untyped-defs

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class CountryLiving(AbstractScraper):
Expand All @@ -10,33 +9,16 @@ def host(cls):
return "countryliving.com"

def title(self):
return self.soup.find("h1", {"class": "content-hed recipe-hed"}).get_text()

def author(self):
return self.soup.find("span", {"rel": "author"}).get_text()
return self.schema.title()

def total_time(self):
return get_minutes(
self.soup.find("span", {"class": "total-time-amount"}).parent
)
return self.schema.total_time()

def yields(self):
yields = self.soup.find(
"div", {"class": "recipe-details-item yields"}
).get_text()

return get_yields("{} servings".format(yields))
return self.schema.yields()

def ingredients(self):
ingredients = self.soup.findAll("div", {"class": "ingredient-item"})

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("div", {"class": "direction-lists"}).find_all(
"li"
)

return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()
22 changes: 1 addition & 21 deletions recipe_scrapers/cucchiaio.py
@@ -1,6 +1,5 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields


class Cucchiaio(AbstractScraper):
Expand All @@ -14,30 +13,11 @@ def author(self):
def title(self):
return self.schema.title()

def total_time(self):
block = self.soup.find("div", {"class": "scheda-ricetta-new"})
if block:
return sum(map(get_minutes, block.findAll("tr")))
return 0

def yields(self):
header = self.soup.find("td", string="PORZIONI")
if header:
value = header.find_next("td")
return get_yields(value)
return None

def image(self):
data = self.soup.find("div", {"class": "auto"}).find("img", {"class": "image"})
if data:
data = data.get("src")
return data
return self.schema.image()

def ingredients(self):
return self.schema.ingredients()

def instructions(self):
return self.schema.instructions()

def ratings(self):
return None
3 changes: 3 additions & 0 deletions recipe_scrapers/cuisineaz.py
Expand Up @@ -7,6 +7,9 @@ class CuisineAZ(AbstractScraper):
def host(cls):
return "cuisineaz.com"

def author(self):
return self.schema.author()

def title(self):
return self.schema.title()

Expand Down
39 changes: 6 additions & 33 deletions recipe_scrapers/delish.py
@@ -1,13 +1,5 @@
# mypy: disallow_untyped_defs=False
# delish.py
# Written by J. Kwon
# Freely released the code to recipe_scraper group
# March 1st, 2020
# ==========================================================


from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class Delish(AbstractScraper):
Expand All @@ -16,38 +8,19 @@ def host(cls):
return "delish.com"

def title(self):
return normalize_string(self.soup.find("h1").get_text())
return self.schema.title()

# Return total time to complete dish in minutes (includes prep time)
def total_time(self):
total_time_class = self.soup.find("span", {"class": "total-time-amount"})
return get_minutes(total_time_class)
return self.schema.total_time()

def yields(self):
yields_class = self.soup.find("span", {"class": "yields-amount"})

return get_yields(yields_class)
return self.schema.yields()

def image(self):
try:
# Case when image is at the top of the recipe content div
image = self.soup.find(
"div", {"class": "content-lede-image-wrap aspect-ratio-1x1"}
).find("img")
return image["data-src"] if image else None

except Exception:
# If the image is not at the top, it will be found at the
# bottom of the recipe content div
image = self.soup.find("picture")
return image.find("source")["data-srcset"] if image else None
return self.schema.image()

def ingredients(self):
ingredients = self.soup.findAll("div", {"class": "ingredient-item"})
return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("div", {"class": "direction-lists"}).findAll("li")
return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()
40 changes: 6 additions & 34 deletions recipe_scrapers/finedininglovers.py
@@ -1,7 +1,6 @@
# mypy: allow-untyped-defs

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string


class FineDiningLovers(AbstractScraper):
Expand All @@ -10,51 +9,24 @@ def host(cls):
return "finedininglovers.com"

def title(self):
return self.soup.find("h1", {"class": "recipe-full-class"}).get_text()
return self.schema.title()

def author(self):
container = self.soup.find("div", {"class": "author-name"})
if container:
return container.find("a").get_text()

def total_time(self):
return get_minutes(self.soup.find("div", {"class": "timing"}))
return self.schema.total_time()

def yields(self):
yields = self.soup.find(
"div", {"class": "field--name-field-recipe-serving-num"}
)

return get_yields("{} servings".format(yields))
return self.schema.yields()

def ingredients(self):
ingredients_parent = self.soup.find("div", {"class": "ingredients-box"})
ingredients = ingredients_parent.findAll(
"div", {"class": "paragraph--type--recipe-ingredient"}
)

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return self.schema.ingredients()

def instructions(self):
instructions_parent = self.soup.find(
"div", {"class": "field--name-field-recipe-para-steps"}
)

if instructions_parent is not None:
instructions = instructions_parent.findAll(
"div", {"class": "paragraph--type--recipe-step"}
)
else:
instructions_parent = self.soup.find("div", {"class": "ante-body"})
instructions = instructions_parent.findAll({"li", "p"})

return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)
return self.schema.instructions()

def image(self):
image = self.soup.select_one(".image-zone picture img")
image_url = image["data-src"].split("?")[0]
image_base_url = "https://www.finedininglovers.com"

return "{}{}".format(image_base_url, image_url) if image else None
return self.schema.image()