Skip to content

Commit

Permalink
Ploetzblog and test configurability
Browse files Browse the repository at this point in the history
  • Loading branch information
mlduff committed Apr 27, 2024
1 parent 4a11ec8 commit 9790ad7
Show file tree
Hide file tree
Showing 6 changed files with 3,227 additions and 7 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ Scrapers available for:
- `https://www.pingodoce.pt/ <https://www.pingodoce.pt>`_
- `https://pinkowlkitchen.com/ <https://pinkowlkitchen.com/>`_
- `https://www.platingpixels.com/ <https://www.platingpixels.com/>`_
- `https://www.ploetzblog.de/ <https://www.ploetzblog.de/>`_
- `https://plowingthroughlife.com/ <https://plowingthroughlife.com/>`_
- `https://popsugar.com/ <https://popsugar.com>`_
- `https://practicalselfreliance.com/ <https://practicalselfreliance.com>`_
Expand Down
2 changes: 2 additions & 0 deletions recipe_scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@
from .pingodoce import PingoDoce
from .pinkowlkitchen import PinkOwlKitchen
from .platingpixels import PlatingPixels
from .ploetzblog import Ploetzblog
from .plowingthroughlife import PlowingThroughLife
from .popsugar import PopSugar
from .practicalselfreliance import PracticalSelfReliance
Expand Down Expand Up @@ -388,6 +389,7 @@
MyJewishLearning.host(): MyJewishLearning,
NutritionFacts.host(): NutritionFacts,
PinchOfYum.host(): PinchOfYum,
Ploetzblog.host(): Ploetzblog,
Recept.host(): Recept,
RicettePerBimby.host(): RicettePerBimby,
StrongrFastr.host(): StrongrFastr,
Expand Down
142 changes: 142 additions & 0 deletions recipe_scrapers/ploetzblog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# mypy: allow-untyped-defs

import re
from typing import List

from ._abstract import AbstractScraper
from ._grouping_utils import IngredientGroup
from ._utils import normalize_string


class Ploetzblog(AbstractScraper):
    """Scraper for recipe pages on ploetzblog.de.

    Most fields are read from the page markup. The author and the total
    time are pulled with regular expressions out of an inline ``<script>``
    element inside ``<main id="main-content">``; category, image, ratings
    and cuisine are delegated to ``self.schema``.
    """

    @classmethod
    def host(cls):
        return "ploetzblog.de"

    def author(self):
        # The value is returned exactly as it appears in the script text;
        # escape sequences inside it are not decoded here.
        return self._get_script_string_field("authorName")

    def title(self):
        return self.soup.find("h1").text

    def category(self):
        return self.schema.category()

    def total_time(self):
        # Could also be scraped manually from the page text
        # Issue is that the time units are in German, which get_minutes does not work for
        return self._get_script_number_field("preparationTime")

    def yields(self):
        """Return "<count> <unit>" built from the piece-count input and its neighbouring cell."""
        count_input = self.soup.find("input", {"id": "recipePieceCount"})
        count = count_input.get("value")

        # The unit text lives in the <td> following the cell that holds the input.
        unit_td = count_input.parent.find_next_sibling("td")
        unit = normalize_string(unit_td.text)

        return f"{count} {unit}"

    def image(self):
        return self.schema.image()

    def ingredients(self):
        """Return the ingredient lines of the second table in the ingredients container."""
        ingredients_div = self.soup.find(
            "div", {"class": "we2p-pb-recipe__ingredients"}
        )
        # Index 1: the second <table> inside the container is the one parsed.
        ingredients_table = ingredients_div.find_all("table")[1]
        return self._get_ingredients_from_table(ingredients_table)

    def ingredient_groups(self) -> List[IngredientGroup]:
        """Return one IngredientGroup per group container, titled by its <h4>."""
        ingredient_groups = []

        group_divs = self.soup.find_all(
            "div", {"class": "module-mb-4 vg-wort-text module-break-inside-avoid"}
        )
        for group_div in group_divs:
            purpose = normalize_string(group_div.find("h4").text)
            ingredients = self._get_ingredients_from_table(group_div.find("table"))
            ingredient_groups.append(IngredientGroup(ingredients, purpose=purpose))

        return ingredient_groups

    def instructions(self):
        """Return the first two instruction paragraphs joined by newlines."""
        instruction_ps = self.soup.find_all(
            "p", {"class": "module-float-left module-my-auto we2p-autolinker"}
        )
        instructions = [
            normalize_string(instruction.text) for instruction in instruction_ps
        ]
        # NOTE(review): only the first two matching paragraphs are returned;
        # confirm this truncation is intended rather than a leftover.
        return "\n".join(instructions[:2])

    def ratings(self):
        return self.schema.ratings()

    def cuisine(self):
        return self.schema.cuisine()

    def description(self):
        """Return every <p> of the description container, one per line."""
        description_div = self.soup.find(
            "div", {"class": "we2p-pb-recipe__description"}
        )
        return "\n".join(
            normalize_string(p.text) for p in description_div.find_all("p")
        )

    def site_name(self):
        return "Plötzblog"

    def _get_ingredients_from_table(self, ingredients_table):
        """Return one string per table row, built from the row's first two cells.

        The non-empty normalized texts of all ``<span>`` elements found in
        those cells are joined with single spaces. A row without any such
        text contributes an empty string.
        """
        ingredients = []

        for tr in ingredients_table.find_all("tr"):
            line = []
            for td in tr.find_all("td", limit=2):
                for span in td.find_all("span"):
                    text = normalize_string(span.text)
                    if text:
                        line.append(text)
            ingredients.append(" ".join(line))

        return ingredients

    def _get_script(self):
        """Return the inline script inside the main content that carries the recipe data."""
        main = self.soup.find("main", {"id": "main-content"})
        return main.find(
            "script", string=re.compile(r'"types":\["ForumPost","Recipe"\]')
        )

    def _get_field_name_pattern(self, field_name):
        """Return a regex fragment matching ``"<field_name>":`` with optional whitespace."""
        # Escape the name so regex metacharacters in it are matched literally.
        return f'\\"{re.escape(field_name)}\\"\\s*:\\s*'

    def _get_script_string_field(self, field_name):
        """Return the raw text of a quoted field in the script, or None if absent."""
        script = self._get_script()

        result = re.search(
            self._get_field_name_pattern(field_name) + '\\"([^"]+)', script.string
        )
        if not result:
            return None

        return result.group(1)

    def _get_script_number_field(self, field_name):
        """Return a numeric field of the script as an int, or None if absent.

        Only the numeric literal itself is captured, so the value parses
        regardless of which character follows it (",", "}", "]" ...). A
        fractional part, if present, is truncated. A non-numeric value such
        as ``null`` yields None.
        """
        script = self._get_script()

        result = re.search(
            self._get_field_name_pattern(field_name) + r"(-?\d+(?:\.\d+)?)",
            script.string,
        )
        if not result:
            return None

        number = result.group(1)
        return int(float(number)) if "." in number else int(number)
45 changes: 38 additions & 7 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import pathlib
import unittest
from enum import Enum
from typing import Callable

from recipe_scrapers import scrape_html
Expand Down Expand Up @@ -35,6 +36,33 @@
"equipment",
]

OPTIONS_KEY = "_options"


class TestOptions(Enum):
    """Switches that an expected-output file can set to alter how it is tested.

    Each member is declared as ``(key, default)``: ``key`` becomes the
    member's ``value`` (the name looked up in the test's options node) and
    ``default`` is stored on the member for use when the key is not set.
    """

    # Value used for an option when the test data does not set it.
    default: bool

    # Controls if the consistent ingredient groups test is run.
    # Disable if ingredient groups contain sub-quantities of the same
    # ingredient (as the test will fail).
    CONSISTENT_INGREDIENT_GROUPS = ("consistent_ingredient_groups", True)

    def __new__(cls, value: str, default: bool):
        # Only the key is the enum value, so TestOptions("<key>") lookups
        # work; the default rides along as a plain attribute.
        obj = object.__new__(cls)
        obj._value_ = value
        obj.default = default
        return obj


def get_options(expect):
    """Resolve every TestOptions member against the expected-output mapping.

    Returns a dict keyed by TestOptions member. A member takes the value
    given under the options node of ``expect`` when present, and its own
    ``default`` otherwise. Both the options node and the individual option
    may be missing.
    """
    configured = expect.get(OPTIONS_KEY, {})
    return {
        option: configured.get(option.value, option.default)
        for option in TestOptions
    }


class RecipeTestCase(unittest.TestCase):
maxDiff = None
Expand Down Expand Up @@ -80,6 +108,8 @@ def test_func(self):
]
actual = scrape_html(testhtml.read_text(encoding="utf-8"), host)

options = get_options(expect)

# Mandatory tests
# If the key isn't present, check an assertion is raised
for key in MANDATORY_TESTS:
Expand Down Expand Up @@ -110,14 +140,15 @@ def test_func(self):
msg=f"The actual value for .{key}() did not match the expected value.",
)

# Assert that the ingredients returned by the ingredient_groups() function
# are the same as the ingredients returned by the ingredients() function.
grouped = []
for group in actual.ingredient_groups():
grouped.extend(group.ingredients)
if options.get(TestOptions.CONSISTENT_INGREDIENT_GROUPS):
# Assert that the ingredients returned by the ingredient_groups() function
# are the same as the ingredients returned by the ingredients() function.
grouped = []
for group in actual.ingredient_groups():
grouped.extend(group.ingredients)

with self.subTest("ingredient_groups"):
self.assertEqual(sorted(actual.ingredients()), sorted(grouped))
with self.subTest("ingredient_groups"):
self.assertEqual(sorted(actual.ingredients()), sorted(grouped))

return test_func

Expand Down
45 changes: 45 additions & 0 deletions tests/test_data/ploetzblog.de/ploetzblog.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"author": "Lutz Gei\\u00dfler",
"canonical_url": "ploetzblog.de",
"host": "ploetzblog.de",
"description": "Mein bislang bestes Weizensauerteigbrot, ganz ohne Backhefe.\nGrobe bis mittlere, unregelmäßige Porung, wattige Krume und kaum spürbare, milde Säure. Der Teigling bekommt eine lange kalte Stückgare und entwickelt auch deshalb seinen wilden Trieb im Gusseisentopf.\nFür etwas mehr Charakter kann der Sauerteig mit Vollkornmehl angesetzt werden.\nWichtig ist das triebstarke und aktive Anstellgut, das 2 – 3 Mal vor dem Ansetzen des Sauerteiges bei 27 – 28 °C aufgefrischt werden sollte.\nHinweis: Wahlweise kann das Brot auch im auf 250 °C aufgeheizten Gusseisentopf 50 Minuten fallend auf 220 °C gebacken werden. Dann den Deckel nach 40 Minuten abnehmen.",
"image": "https://webimages.we2p.de/2/ploetzblog/entity/gallery/619f68b528ae7154616ab768/Mildes_Weizensauerteigbrot_20160506.jpg",
"ingredients": [
"558 g Weizenmehl 550",
"389 g Wasser",
"90 g Weizenanstellgut TA 200 (weich)",
"13 g Salz"
],
"ingredient_groups": [
{
"ingredients": [
"90 g Wasser",
"90 g Weizenmehl 550",
"90 g Weizenanstellgut TA 200 (weich)"
],
"purpose": "Weizensauerteig"
},
{
"ingredients": [
"13 g Salz",
"298 g Wasser",
"467 g Weizenmehl 550",
"gesamter Weizensauerteig"
],
"purpose": "Hauptteig"
}
],
"instructions": "Die Zutaten in der genannten Reihenfolge in eine Schüssel wiegen.\nMischen, bis sich die Zutaten zu einem weichen Teig verbunden haben (gewünschte Teigtemperatur: ca. 28 °C).",
"instructions_list": [
"Die Zutaten in der genannten Reihenfolge in eine Schüssel wiegen.",
"Mischen, bis sich die Zutaten zu einem weichen Teig verbunden haben (gewünschte Teigtemperatur: ca. 28 °C)."
],
"language": "de",
"site_name": "Plötzblog",
"title": "Mildes Weizensauerteigbrot",
"total_time": 982,
"yields": "1 Stück zu (je) ca. 1050 g",
"_options": {
"consistent_ingredient_groups": false
}
}

0 comments on commit 9790ad7

Please sign in to comment.