Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Project Gezond scraper (Dutch Website) #691

Merged
merged 3 commits into from Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 14 additions & 14 deletions recipe_scrapers/projectgezond.py
Expand Up @@ -13,11 +13,11 @@ def author(self):
return "Project Gezond"

def title(self):
return self.soup.find("h1", {"class": "entry-title"}).get_text()
return self.soup.find("h1", {"class": "entry-title"}).text

def category(self):
return [
element.get_text()
element.text
for element in self.soup.find("span", {"class": "meta-category"}).find_all(
"a", {"class": lambda x: x is not None and x.startswith("category")}
)
Expand All @@ -27,44 +27,44 @@ def total_time(self):
time_element = self.soup.find("em", string="Bereidingstijd:").parent
return "".join(
[
element.get_text()
element.text
for element in time_element.children
if element.get_text() != "Bereidingstijd:"
if element.text != "Bereidingstijd:"
]
).strip()

def yields(self):
# Match everything in the h2 with 'Dit heb je nodig'
# The text inside the parentheses contains the yield for the ingredients that are listed
return re.search(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more thing to keep in mind (hopefully not important in this case, but in general, and sorry if I'm explaining things that you understand already): it's worth being careful to limit what regular expressions can match, and/or how much input text they are given.

Just something I repeat (no pun intended) at nearly every available opportunity 😄

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your advice! I'm not really that familiar with regular expressions, so all tips are appreciated 😄

r"\(([^)]+)",
r"Dit heb je nodig \(([^)]+)",
self.soup.find(
"h2", string=lambda x: x.startswith("Dit heb je nodig")
).get_text(),
).text,
).group(1)

def image(self):
return self.schema.image()

def ingredients(self):
# Sadly, there is no other way to find the start of the ingredients than to look for the string content
ingredients_table = self.soup.find(
"h2", string=lambda x: x.startswith("Dit heb je nodig")
).next_sibling.next_sibling
ingredients = [
ingredient.get_text()
ingredient.text
for ingredient in ingredients_table
if ingredient.get_text().strip()
if ingredient.text.strip()
]
return ingredients

def instructions(self):
# Sadly, there is no other way to find the start of the instructions than to look for the string content
instructions_table = self.soup.find(
"h2", string=lambda x: x.startswith("Zo maak je het")
).next_sibling.next_sibling.next_sibling.next_sibling
instructions = [
instruction.get_text()
instruction.text
for instruction in instructions_table
if instruction.get_text().strip()
if instruction.text.strip()
]
return "\n".join(instructions).strip()

Expand All @@ -84,9 +84,9 @@ def description(self):
description = ""
for content_element in content_start.children:
# If we reach this, the ingredients are listed and the description is complete
if content_element.get_text().startswith("Dit heb je nodig"):
if content_element.text.startswith("Dit heb je nodig"):
break

description += content_element.get_text()
description += content_element.text

return description.strip()
2 changes: 1 addition & 1 deletion tests/test_projectgezond.py
Expand Up @@ -18,7 +18,7 @@ def test_title(self):
self.assertEqual("Boeuf bourguignon", self.harvester_class.title())

def test_category(self):
self.assertEqual("Diner\nKerstrecepten", self.harvester_class.category())
self.assertEqual(["Diner", "Kerstrecepten"], self.harvester_class.category())
jayaddison marked this conversation as resolved.
Show resolved Hide resolved

def test_total_time(self):
self.assertEqual("30 minuten + 2 uur stoven", self.harvester_class.total_time())
Expand Down