Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Ploetzblog + configuration for unit tests #1100

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ Scrapers available for:
- `https://www.pingodoce.pt/ <https://www.pingodoce.pt>`_
- `https://pinkowlkitchen.com/ <https://pinkowlkitchen.com/>`_
- `https://www.platingpixels.com/ <https://www.platingpixels.com/>`_
- `https://www.ploetzblog.de/ <https://www.ploetzblog.de/>`_
- `https://plowingthroughlife.com/ <https://plowingthroughlife.com/>`_
- `https://popsugar.com/ <https://popsugar.com>`_
- `https://practicalselfreliance.com/ <https://practicalselfreliance.com>`_
Expand Down
2 changes: 2 additions & 0 deletions recipe_scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@
from .pingodoce import PingoDoce
from .pinkowlkitchen import PinkOwlKitchen
from .platingpixels import PlatingPixels
from .ploetzblog import Ploetzblog
from .plowingthroughlife import PlowingThroughLife
from .popsugar import PopSugar
from .practicalselfreliance import PracticalSelfReliance
Expand Down Expand Up @@ -388,6 +389,7 @@
MyJewishLearning.host(): MyJewishLearning,
NutritionFacts.host(): NutritionFacts,
PinchOfYum.host(): PinchOfYum,
Ploetzblog.host(): Ploetzblog,
Recept.host(): Recept,
RicettePerBimby.host(): RicettePerBimby,
StrongrFastr.host(): StrongrFastr,
Expand Down
142 changes: 142 additions & 0 deletions recipe_scrapers/ploetzblog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# mypy: allow-untyped-defs

import re
from typing import List

from ._abstract import AbstractScraper
from ._grouping_utils import IngredientGroup
from ._utils import normalize_string


class Ploetzblog(AbstractScraper):
    """Scraper for recipe pages on ploetzblog.de.

    Title, yields, ingredients, instructions and description are read from the
    page markup. The author and total time are extracted with regular
    expressions from an inline ``<script>`` inside ``<main id="main-content">``.
    Category, image, ratings and cuisine are delegated to ``self.schema``.
    """

    @classmethod
    def host(cls):
        """Return the host name handled by this scraper."""
        return "ploetzblog.de"

    def author(self):
        """Return the ``authorName`` value from the inline recipe script.

        Returns ``None`` when the script or the field cannot be found.
        """
        # NOTE(review): the value is the raw text between the quotes, so JSON
        # escape sequences (e.g. \u00df) are returned undecoded. The bundled
        # test fixture expects exactly that; confirm whether decoding is wanted.
        return self._get_script_string_field("authorName")

    def title(self):
        """Return the text of the first ``<h1>`` element on the page."""
        return self.soup.find("h1").text

    def category(self):
        """Return the category reported by the schema data."""
        return self.schema.category()

    def total_time(self):
        """Return the ``preparationTime`` number from the inline recipe script.

        Returns ``None`` when the script or the field cannot be found.
        """
        # The time is also present in the page text, but its units are written
        # in German, which get_minutes does not handle.
        return self._get_script_number_field("preparationTime")

    def yields(self):
        """Return the piece count followed by its unit text.

        The count is the ``value`` of ``<input id="recipePieceCount">``; the unit
        is the text of the ``<td>`` that follows the input's parent element.
        """
        count_input = self.soup.find("input", {"id": "recipePieceCount"})
        count = count_input.get("value")

        unit_td = count_input.parent.find_next_sibling("td")
        unit = normalize_string(unit_td.text)

        return f"{count} {unit}"

    def image(self):
        """Return the image reported by the schema data."""
        return self.schema.image()

    def ingredients(self):
        """Return the ingredient lines of the overall ingredients table."""
        ingredients_div = self.soup.find(
            "div", {"class": "we2p-pb-recipe__ingredients"}
        )
        # The second table inside the ingredients container is the one parsed.
        ingredients_table = ingredients_div.find_all("table")[1]
        return self._get_ingredients_from_table(ingredients_table)

    def ingredient_groups(self) -> List[IngredientGroup]:
        """Return one ``IngredientGroup`` per recipe-stage block.

        Each matching ``<div>`` contributes its ``<h4>`` text as the group's
        purpose and the rows of its first ``<table>`` as the ingredients.
        """
        ingredient_groups = []

        group_divs = self.soup.find_all(
            "div", {"class": "module-mb-4 vg-wort-text module-break-inside-avoid"}
        )
        for group_div in group_divs:
            purpose = normalize_string(group_div.find("h4").text)
            ingredients = self._get_ingredients_from_table(group_div.find("table"))
            ingredient_groups.append(IngredientGroup(ingredients, purpose=purpose))

        return ingredient_groups

    def instructions(self):
        """Return the first two instruction paragraphs joined by newlines."""
        instruction_ps = self.soup.find_all(
            "p", {"class": "module-float-left module-my-auto we2p-autolinker"}
        )
        instructions = [
            normalize_string(instruction.text) for instruction in instruction_ps
        ]
        # NOTE(review): only the first two matching paragraphs are returned and
        # the bundled test fixture relies on that; confirm whether the remaining
        # steps should be included as well.
        return "\n".join(instructions[:2])

    def ratings(self):
        """Return the ratings reported by the schema data."""
        return self.schema.ratings()

    def cuisine(self):
        """Return the cuisine reported by the schema data."""
        return self.schema.cuisine()

    def description(self):
        """Return the description paragraphs joined by newlines."""
        description_div = self.soup.find(
            "div", {"class": "we2p-pb-recipe__description"}
        )
        return "\n".join(
            normalize_string(p.text) for p in description_div.find_all("p")
        )

    def site_name(self):
        """Return the display name of the site."""
        return "Plötzblog"

    def _get_ingredients_from_table(self, ingredients_table):
        """Build one ingredient string per table row.

        For each ``<tr>``, the non-empty ``<span>`` texts found in its first two
        ``<td>`` cells are joined with single spaces. Rows that yield no text
        are skipped so the result never contains empty entries.
        """
        ingredients = []

        for tr in ingredients_table.find_all("tr"):
            parts = []
            for td in tr.find_all("td", limit=2):
                for span in td.find_all("span"):
                    text = normalize_string(span.text)
                    if text:
                        parts.append(text)
            if parts:
                ingredients.append(" ".join(parts))

        return ingredients

    def _get_script(self):
        """Return the inline script tag holding the recipe payload, or ``None``.

        The script is searched for inside ``<main id="main-content">`` and is
        recognised by its ``"types":["ForumPost","Recipe"]`` marker.
        """
        main = self.soup.find("main", {"id": "main-content"})
        if main is None:
            return None

        return main.find(
            "script", string=re.compile(r'"types":\["ForumPost","Recipe"\]')
        )

    def _get_script_text(self):
        """Return the text of the recipe script, or ``None`` when it is absent."""
        script = self._get_script()
        if script is None:
            return None
        return script.string

    def _get_field_name_pattern(self, field_name):
        """Return a regex fragment matching ``"<field_name>":`` and any spacing.

        The field name is escaped so it is always matched literally.
        """
        return '"' + re.escape(field_name) + r'"\s*:\s*'

    def _get_script_string_field(self, field_name):
        """Return the raw text of a quoted string field, or ``None`` if not found.

        Backslash-escaped characters (including escaped quotes) are kept as part
        of the value instead of terminating it; no unescaping is performed.
        """
        text = self._get_script_text()
        if text is None:
            return None

        result = re.search(
            self._get_field_name_pattern(field_name) + r'"((?:[^"\\]|\\.)+)', text
        )
        if not result:
            return None

        return result.group(1)

    def _get_script_number_field(self, field_name):
        """Return an integer field from the recipe script, or ``None`` if not found.

        Only the leading run of digits (with an optional minus sign) is parsed,
        so the field may be followed by ``,``, ``}`` or whitespace.
        """
        text = self._get_script_text()
        if text is None:
            return None

        result = re.search(
            self._get_field_name_pattern(field_name) + r"(-?\d+)", text
        )
        if not result:
            return None

        return int(result.group(1))
77 changes: 8 additions & 69 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,8 @@
import json
import pathlib
import unittest
from typing import Callable

from recipe_scrapers import scrape_html
from recipe_scrapers._grouping_utils import IngredientGroup

# Scraper method names checked for every test case. When the expected JSON
# contains the key, the method's return value must equal it; when the key is
# missing, calling the method is expected to raise an exception.
MANDATORY_TESTS = [
"author",
"canonical_url",
"host",
"description",
"image",
"ingredients",
"ingredient_groups",
"instructions",
"instructions_list",
"language",
"site_name",
"title",
"total_time",
"yields",
]

# Scraper method names that are compared against the expected JSON only when
# the key is present there; a missing key means the check is skipped.
OPTIONAL_TESTS = [
"category",
"cook_time",
"cuisine",
"nutrients",
"prep_time",
"cooking_method",
"ratings",
"reviews",
"equipment",
]
from .data_utils import load_test, run_mandatory_tests, run_optional_test


class RecipeTestCase(unittest.TestCase):
Expand Down Expand Up @@ -72,43 +41,10 @@ def test_func_factory(
"""

def test_func(self):
with open(testjson, encoding="utf-8") as f:
expect = json.load(f)
expect["ingredient_groups"] = [
IngredientGroup(**group)
for group in expect.get("ingredient_groups", [])
]
actual = scrape_html(testhtml.read_text(encoding="utf-8"), host)

# Mandatory tests
# If the key isn't present, check that an exception is raised
for key in MANDATORY_TESTS:
with self.subTest(key):
scraper_func = getattr(actual, key)
if key in expect.keys():
self.assertEqual(
expect[key],
scraper_func(),
msg=f"The actual value for .{key}() did not match the expected value.",
)
else:
with self.assertRaises(
Exception,
msg=f".{key}() was expected to raise an exception but it did not.",
):
scraper_func()

# Optional tests
# If the key isn't present, skip
for key in OPTIONAL_TESTS:
with self.subTest(key):
scraper_func = getattr(actual, key)
if key in expect.keys():
self.assertEqual(
expect[key],
scraper_func(),
msg=f"The actual value for .{key}() did not match the expected value.",
)
expect, actual = load_test(host, testhtml, testjson)

run_mandatory_tests(self, expect, actual)
run_optional_test(self, expect, actual)

# Assert that the ingredients returned by the ingredient_groups() function
# are the same as the ingredients returned by the ingredients() function.
Expand Down Expand Up @@ -176,6 +112,9 @@ def load_tests(
tests = loader.loadTestsFromTestCase(RecipeTestCase)
suite.addTest(tests)

data_driven_tests = loader.discover("tests/data_driven")
suite.addTests(data_driven_tests)

# Add library tests to test suite
library_tests = loader.discover("tests/library")
suite.addTests(library_tests)
Expand Down
Empty file added tests/data_driven/__init__.py
Empty file.
42 changes: 42 additions & 0 deletions tests/data_driven/test_data/ploetzblog.de/ploetzblog.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"author": "Lutz Gei\\u00dfler",
"canonical_url": "ploetzblog.de",
"host": "ploetzblog.de",
"description": "Mein bislang bestes Weizensauerteigbrot, ganz ohne Backhefe.\nGrobe bis mittlere, unregelmäßige Porung, wattige Krume und kaum spürbare, milde Säure. Der Teigling bekommt eine lange kalte Stückgare und entwickelt auch deshalb seinen wilden Trieb im Gusseisentopf.\nFür etwas mehr Charakter kann der Sauerteig mit Vollkornmehl angesetzt werden.\nWichtig ist das triebstarke und aktive Anstellgut, das 2 – 3 Mal vor dem Ansetzen des Sauerteiges bei 27 – 28 °C aufgefrischt werden sollte.\nHinweis: Wahlweise kann das Brot auch im auf 250 °C aufgeheizten Gusseisentopf 50 Minuten fallend auf 220 °C gebacken werden. Dann den Deckel nach 40 Minuten abnehmen.",
"image": "https://webimages.we2p.de/2/ploetzblog/entity/gallery/619f68b528ae7154616ab768/Mildes_Weizensauerteigbrot_20160506.jpg",
"ingredients": [
"558 g Weizenmehl 550",
"389 g Wasser",
"90 g Weizenanstellgut TA 200 (weich)",
"13 g Salz"
],
"ingredient_groups": [
{
"ingredients": [
"90 g Wasser",
"90 g Weizenmehl 550",
"90 g Weizenanstellgut TA 200 (weich)"
],
"purpose": "Weizensauerteig"
},
{
"ingredients": [
"13 g Salz",
"298 g Wasser",
"467 g Weizenmehl 550",
"gesamter Weizensauerteig"
],
"purpose": "Hauptteig"
}
],
"instructions": "Die Zutaten in der genannten Reihenfolge in eine Schüssel wiegen.\nMischen, bis sich die Zutaten zu einem weichen Teig verbunden haben (gewünschte Teigtemperatur: ca. 28 °C).",
"instructions_list": [
"Die Zutaten in der genannten Reihenfolge in eine Schüssel wiegen.",
"Mischen, bis sich die Zutaten zu einem weichen Teig verbunden haben (gewünschte Teigtemperatur: ca. 28 °C)."
],
"language": "de",
"site_name": "Plötzblog",
"title": "Mildes Weizensauerteigbrot",
"total_time": 982,
"yields": "1 Stück zu (je) ca. 1050 g"
}