Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Project Gezond scraper (Dutch Website) #691

Merged
merged 3 commits into from Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions README.rst
Expand Up @@ -242,6 +242,7 @@ Scrapers available for:
- `https://popsugar.com/ <https://popsugar.com>`_
- `https://practicalselfreliance.com/ <https://practicalselfreliance.com>`_
- `https://www.primaledgehealth.com/ <https://www.primaledgehealth.com/>`_
- `https://www.projectgezond.nl/ <https://www.projectgezond.nl/>`_
- `https://przepisy.pl/ <https://przepisy.pl>`_
- `https://purelypope.com/ <https://purelypope.com>`_
- `https://purplecarrot.com/ <https://purplecarrot.com>`_
Expand Down
2 changes: 2 additions & 0 deletions recipe_scrapers/__init__.py
Expand Up @@ -157,6 +157,7 @@
from .popsugar import PopSugar
from .practicalselfreliance import PracticalSelfReliance
from .primaledgehealth import PrimalEdgeHealth
from .projectgezond import ProjectGezond
from .przepisy import Przepisy
from .purelypope import PurelyPope
from .purplecarrot import PurpleCarrot
Expand Down Expand Up @@ -391,6 +392,7 @@
PracticalSelfReliance.host(): PracticalSelfReliance,
PracticalSelfReliance.host(domain="creativecanning.com"): PracticalSelfReliance,
PrimalEdgeHealth.host(): PrimalEdgeHealth,
ProjectGezond.host(): ProjectGezond,
Przepisy.host(): Przepisy,
PurelyPope.host(): PurelyPope,
PurpleCarrot.host(): PurpleCarrot,
Expand Down
92 changes: 92 additions & 0 deletions recipe_scrapers/projectgezond.py
@@ -0,0 +1,92 @@
# mypy: allow-untyped-defs
import re

from ._abstract import AbstractScraper


class ProjectGezond(AbstractScraper):
@classmethod
def host(cls):
return "projectgezond.nl"

def author(self):
    """Return the author name, which is a fixed string for this site."""
    site_author = "Project Gezond"
    return site_author

def title(self):
    """Return the text of the first ``h1`` element with class ``entry-title``."""
    heading = self.soup.find("h1", {"class": "entry-title"})
    return heading.text

def category(self):
    """Return the category names as a list of strings.

    The names are the texts of the ``a`` elements inside the
    ``span.meta-category`` element whose class starts with ``category``.
    """

    def is_category_class(css_class):
        # Reject a missing class value before inspecting its prefix.
        return css_class is not None and css_class.startswith("category")

    meta_span = self.soup.find("span", {"class": "meta-category"})
    category_links = meta_span.find_all("a", {"class": is_category_class})
    return [link.text for link in category_links]

def total_time(self):
    """Return the text that accompanies the 'Bereidingstijd:' label.

    The label lives in an ``em`` element; the result is the stripped,
    concatenated text of every child of that element's parent except the
    label itself.
    """
    label = "Bereidingstijd:"
    container = self.soup.find("em", string=label).parent

    pieces = []
    for node in container.children:
        node_text = node.text
        if node_text != label:
            pieces.append(node_text)

    return "".join(pieces).strip()

def yields(self):
# Match everything in the h2 with 'Dit heb je nodig'
# The text inside the parentheses contains the yield for the ingredients that are listed
return re.search(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more thing to keep in mind - hopefully not important in this case, but in general (and sorry if I'm explaining things that you understand already) - it's worth being careful to limit what regular expressions can match on, and/or how much input text they are given.

Just something I repeat (no pun intended) at nearly every available opportunity 😄

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your advice! I'm not really that familiar with regular expressions, so all tips are appreciated 😄

r"Dit heb je nodig \(([^)]+)",
self.soup.find(
"h2", string=lambda x: x.startswith("Dit heb je nodig")
).text,
).group(1)

def image(self):
    """Return the image value reported by ``self.schema``."""
    schema_data = self.schema
    return schema_data.image()

def ingredients(self):
    """Return the ingredient lines listed under the 'Dit heb je nodig' heading.

    Returns:
        list[str]: the text of every child of the ingredient container
        whose text is not blank.
    """

    def is_ingredients_heading(text):
        # An ``h2`` without a single text child is offered to the filter as
        # None; skip it instead of failing on ``None.startswith``.
        return text is not None and text.startswith("Dit heb je nodig")

    heading = self.soup.find("h2", string=is_ingredients_heading)
    # The container is the second sibling after the heading; the node in
    # between is skipped (presumably whitespace -- confirm against the markup).
    ingredients_table = heading.next_sibling.next_sibling
    return [
        ingredient.text
        for ingredient in ingredients_table
        if ingredient.text.strip()
    ]

def instructions(self):
    """Return the preparation steps under the 'Zo maak je het' heading.

    Returns:
        str: the non-blank child texts of the instruction container joined
        with newlines, with surrounding whitespace stripped.
    """

    def is_instructions_heading(text):
        # An ``h2`` without a single text child is offered to the filter as
        # None; skip it instead of failing on ``None.startswith``.
        return text is not None and text.startswith("Zo maak je het")

    heading = self.soup.find("h2", string=is_instructions_heading)
    # The container is the fourth sibling after the heading; the nodes in
    # between are skipped (their content is not visible here -- confirm
    # against the markup).
    instructions_table = (
        heading.next_sibling.next_sibling.next_sibling.next_sibling
    )
    steps = [
        instruction.text
        for instruction in instructions_table
        if instruction.text.strip()
    ]
    return "\n".join(steps).strip()

def ratings(self):
    """Return None: ratings do not exist on this site."""
    return None

def cuisine(self):
    """Return None: the cuisine is not listed on this site."""
    return None

def description(self):
    """Return the introductory text that precedes the ingredients heading.

    The children of the ``div.entry-content`` element are read in order and
    their text is collected until a child whose text starts with
    'Dit heb je nodig' (the ingredients heading) is reached.

    Returns:
        str: the collected text with surrounding whitespace stripped.
    """
    # The attribute filter must be a dict mapping attribute name to value.
    content_start = self.soup.find("div", {"class": "entry-content"})

    parts = []
    for content_element in content_start.children:
        element_text = content_element.text
        # Once the ingredients heading shows up the description is complete.
        if element_text.startswith("Dit heb je nodig"):
            break
        parts.append(element_text)

    return "".join(parts).strip()
834 changes: 834 additions & 0 deletions tests/test_data/projectgezond.testhtml

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions tests/test_projectgezond.py
@@ -0,0 +1,72 @@
# mypy: allow-untyped-defs

from recipe_scrapers.projectgezond import ProjectGezond
from tests import ScraperTest


class TestProjectGezondScraper(ScraperTest):

scraper_class = ProjectGezond

def test_host(self):
    """The scraper reports the projectgezond.nl host."""
    expected = "projectgezond.nl"
    actual = self.harvester_class.host()
    self.assertEqual(expected, actual)

def test_author(self):
    """The scraper reports the fixed author name."""
    expected = "Project Gezond"
    actual = self.harvester_class.author()
    self.assertEqual(expected, actual)

def test_title(self):
    """The recipe title is extracted from the test page."""
    expected = "Boeuf bourguignon"
    actual = self.harvester_class.title()
    self.assertEqual(expected, actual)

def test_category(self):
    """Both categories of the test page are returned, in order."""
    expected = ["Diner", "Kerstrecepten"]
    actual = self.harvester_class.category()
    self.assertEqual(expected, actual)
jayaddison marked this conversation as resolved.
Show resolved Hide resolved

def test_total_time(self):
    """The preparation time is returned as the site's free-form text."""
    expected = "30 minuten + 2 uur stoven"
    actual = self.harvester_class.total_time()
    self.assertEqual(expected, actual)

def test_yields(self):
    """The yield is the text found in the ingredients heading."""
    expected = "twee personen"
    actual = self.harvester_class.yields()
    self.assertEqual(expected, actual)

def test_image(self):
    """The image URL of the test page is returned."""
    expected = "https://www.projectgezond.nl/wp-content/uploads/2021/11/BoeufBourguignon-768x1024.jpg"
    actual = self.harvester_class.image()
    self.assertEqual(expected, actual)

def test_ingredients(self):
    """All fourteen ingredient lines are returned, in page order."""
    expected = [
        "40 gr ontbijtspek",
        "250 gr runderriblappen",
        "10 gr bloem",
        "1 ui",
        "150 gr winterpeen",
        "1 teentje knoflook",
        "35 gr tomatenpuree",
        "100 ml rode wijn",
        "200 ml runderbouillon",
        "1 laurierblaadje",
        "1 takje tijm",
        "1 kruidnagel",
        "150 gr champignons",
        "50 gr zilveruitjes",
    ]
    actual = self.harvester_class.ingredients()
    self.assertEqual(expected, actual)

def test_instructions(self):
    """The preparation steps are returned as one newline-separated string."""
    expected = "Bak de plakken ontbijtspek bruin en licht krokant in een droge (stoof)pan. Haal uit de pan en zorg dat het bakvet achterblijft.\nSnijd de runderriblappen in blokjes van 2 bij 2 centimeter. Bestrooi met peper, zout en de bloem. Schep om tot alles goed verdeeld is. \nSnijd de ui in halve ringen en de winterpeen in plakken.\nHak de knoflook fijn. \nVerwarm de pan waar het ontbijtspek in gebakken is opnieuw. Bak de riblappen op hoog vuur rondom bruin. \nGebruik indien nodig een klein beetje boter of olijfolie. \nVoeg de ui en winterpeen toe en bak enkele minuten mee met de blokjes vlees.\nZet het vuur lager en voeg de knoflook toe. Bak 1 à 2 minuten mee. Voeg de tomatenpuree toe. Roer los en bak 2 à 3 minuten mee.\nBlus af met de rode wijn. Roer eventuele aanbaksels los van de bodem. Laat de wijn grotendeels verdampen.\nVoeg de runderbouillon, het laurierblaadje, het takje tijm en de kruidnagel toe.\nSnijd de plakken ontbijtspek in stukjes en voeg toe.\nLaat het gerecht ongeveer 2 uur stoven met de deksel op de pan. \nBoen de champignons schoon en snijd ze in kwarten. \nSnijd de zilveruitjes doormidden. \nVoeg de champignons en de zilveruitjes toe.\nLaat alles nog minimaal 30 minuten stoven. Doe dit eventueel zonder deksel op de pan, zodat de boeuf bourguignon wat verder inkookt. \nHaal het laurierblaadje, het takje tijm en de kruidnagel uit de pan."
    actual = self.harvester_class.instructions()
    self.assertEqual(expected, actual)

def test_ratings(self):
    """The scraper reports no ratings."""
    actual = self.harvester_class.ratings()
    self.assertEqual(None, actual)

def test_cuisine(self):
    """The scraper reports no cuisine."""
    actual = self.harvester_class.cuisine()
    self.assertEqual(None, actual)

def test_description(self):
    """The introductory paragraphs are returned as the description."""
    expected = "Deze klassieker vindt zijn oorsprong in de Franse keuken. ‘Rund op bourgondische wijze’ is een vertaling van ‘Boeuf bourguignon’ die niets aan de verbeelding overlaat.\nDit recept is dan ook het perfecte antwoord als het weer tijd is voor een potje stoof!\nWant het is verre van moeilijk om deze ultieme stoofpot te bereiden. Je hebt enkel wat (wacht)tijd en dus geduld nodig. Het is helemaal geen gek idee dat je dit recept al de dag van tevoren klaarmaakt trouwens. De smaken kunnen dan zelfs nog beter intrekken.\nAls dat geen ‘Boeuf bourguignon’ wordt…"
    actual = self.harvester_class.description()
    self.assertEqual(expected, actual)