Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Html info core #1

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
.venv
.vscode
__pycache__
.pytest_cache
aHardReset marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Base image providing uvicorn + gunicorn on Python 3.9.
FROM tiangolo/uvicorn-gunicorn:python3.9

# Use key=value with no spaces around "=". With spaces, Docker falls back to
# the legacy "LABEL <key> <value>" form and the stored value starts with "= ".
LABEL maintainer="Aaron Garibay <aaron.contreras@unosquare.com>"

# Install dependencies before copying the sources so this layer is reused
# when only application code changes.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Application code lives in /app inside the image.
COPY ./html_utils /app
File renamed without changes.
42 changes: 42 additions & 0 deletions html_utils/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Optional

from utils import get_title, get_favicon_url, get_first_h1_in_body, get_meta_name
from scraping import do_get_request, get_soup_for_html

import uvicorn
from fastapi import FastAPI, status
from pydantic import BaseModel, HttpUrl

# FastAPI application instance; the route below registers itself on it via @app.get.
app = FastAPI()

# models

class HTMLBaseInfo(BaseModel):
    """Response model for the basic information scraped from an HTML page.

    Every field is optional and is ``None`` when the corresponding element
    could not be extracted from the page.
    """

    # Text of the <title> tag.
    title: Optional[str] = None
    # Value produced by get_meta_name for the page.
    metaName: Optional[str] = None
    # Value produced by get_favicon_url for the page.
    faviconUrl: Optional[str] = None
    # Text of the first <h1> found in the page body.
    firstH1: Optional[str] = None

# routes

@app.get(
    path="/v1/get-html-base-info",
    summary="Get HTML base info that includes title, meta name, favicon url and first h1",
    description="Get HTML base info that includes title, meta name, favicon url and first h1",
    tags=["html", "scraping"],
    status_code=status.HTTP_200_OK,
    response_model=HTMLBaseInfo,
)
def get_html_base_info(url: HttpUrl):
    """Fetch ``url`` and return its title, meta name, favicon URL and first h1.

    Args:
        url: Address of the page to scrape; validated as an HTTP(S) URL by
            the ``HttpUrl`` annotation.

    Returns:
        An ``HTMLBaseInfo`` whose fields are ``None`` for elements that were
        not found in the page.

    Raises:
        HTTPException: 502 Bad Gateway when the page cannot be fetched.
    """
    # Imported locally so the handler is self-contained and does not rely on
    # changes to the module's import block.
    import requests
    from fastapi import HTTPException

    # NOTE(review): this endpoint fetches arbitrary caller-supplied URLs;
    # consider restricting targets (e.g. internal addresses) if it is ever
    # exposed to untrusted clients.
    try:
        response = do_get_request(url)
    except requests.RequestException as exc:
        # A failing upstream is not a server bug: report it as 502 rather
        # than letting it surface as an unhandled 500.
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Could not fetch the requested URL",
        ) from exc

    soup = get_soup_for_html(response.content)
    return HTMLBaseInfo(
        title=get_title(soup),
        metaName=get_meta_name(soup),
        faviconUrl=get_favicon_url(soup),
        firstH1=get_first_h1_in_body(soup),
    )

# Start a uvicorn server when this module is executed directly as a script.
if __name__ == "__main__":
    # host 0.0.0.0 listens on all network interfaces; the server uses port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
11 changes: 11 additions & 0 deletions html_utils/scraping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import Union

import requests
from bs4 import BeautifulSoup

# Default parser name handed to BeautifulSoup by get_soup_for_html.
HTML_PARSER = "html.parser"

def do_get_request(url: str, timeout: float = 10.0) -> requests.Response:
    """Perform an HTTP GET request and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would tie up the calling worker.

    Returns:
        The ``requests.Response`` for the request. The status code is not
        checked here.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    return requests.get(url, timeout=timeout)

def get_soup_for_html(html_payload: Union[str, bytes], html_parser: str = HTML_PARSER) -> BeautifulSoup:
    """Parse an HTML document into a ``BeautifulSoup`` tree.

    Args:
        html_payload: The markup to parse, as text or as raw bytes.
            (The previous annotation ``str or bytes`` evaluated to plain
            ``str`` and hid the fact that bytes are accepted.)
        html_parser: Name of the parser backend passed to BeautifulSoup;
            defaults to ``HTML_PARSER``.

    Returns:
        The parsed document.
    """
    return BeautifulSoup(html_payload, html_parser)
21 changes: 21 additions & 0 deletions html_utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from bs4 import BeautifulSoup

def get_title(soup: BeautifulSoup):
    """Return the string of the first ``<title>`` tag, or ``None`` if absent."""
    title_tag = soup.find("title")
    if not title_tag:
        return None
    return title_tag.string

def get_meta_name(soup: BeautifulSoup):
    """Return the ``content`` of the ``<meta name="description">`` tag.

    Returns ``None`` when the page has no such tag, or when the tag carries
    no ``content`` attribute.
    """
    description_tag = soup.find("meta", {"name": "description"})
    if not description_tag:
        return None
    # .get() rather than ["content"]: a description tag without a content
    # attribute must yield None instead of raising KeyError.
    return description_tag.get("content")
aHardReset marked this conversation as resolved.
Show resolved Hide resolved

def get_favicon_url(soup: BeautifulSoup):
    """Return the ``href`` of the first ``<link rel="icon">`` tag.

    The value is returned exactly as written in the page; it is not resolved
    against the page URL. Returns ``None`` when there is no such link or the
    link has no ``href`` attribute.
    """
    icon_link = soup.find("link", {"rel": "icon"})
    if not icon_link:
        return None
    # .get() rather than ["href"]: an icon link without an href must yield
    # None instead of raising KeyError.
    return icon_link.get("href")
aHardReset marked this conversation as resolved.
Show resolved Hide resolved

def get_first_h1_in_body(soup: BeautifulSoup):
    """Return the string of the first ``<h1>`` inside ``<body>``.

    Returns ``None`` when the document has no ``<body>`` or the body has no
    ``<h1>``.
    """
    body = soup.find("body")
    if body is None:
        # Fragments or malformed pages may have no <body>; previously this
        # raised AttributeError on None.find(...).
        return None
    first_h1 = body.find("h1")
    return first_h1.string if first_h1 else None
2 changes: 2 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
pythonpath = . html_utils
27 changes: 27 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
anyio==3.6.2
aHardReset marked this conversation as resolved.
Show resolved Hide resolved
attrs==22.1.0
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.9.24
charset-normalizer==2.1.1
click==8.1.3
colorama==0.4.6
fastapi==0.87.0
h11==0.14.0
httpcore==0.16.1
httpx==0.23.1
idna==3.4
iniconfig==1.1.1
packaging==21.3
pluggy==1.0.0
pydantic==1.10.2
pyparsing==3.0.9
pytest==7.2.0
requests==2.28.1
rfc3986==1.5.0
sniffio==1.3.0
soupsieve==2.3.2.post1
starlette==0.21.0
typing_extensions==4.4.0
urllib3==1.26.12
uvicorn==0.20.0
1 change: 1 addition & 0 deletions tests/html_snapshots/pydantic.html

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from html_utils import main, utils, scraping
from fastapi.testclient import TestClient

def get_mocked_payload():
    """Return the text of the saved HTML snapshot used as the test fixture."""
    from pathlib import Path

    # Resolve relative to this test file so the tests pass regardless of the
    # directory pytest is launched from.
    snapshot = Path(__file__).resolve().parent / "html_snapshots" / "pydantic.html"
    # Explicit encoding: the platform default differs between systems.
    # NOTE(review): assumes the snapshot was saved as UTF-8 - confirm.
    return snapshot.read_text(encoding="utf-8")


client = TestClient(main.app)
# create a test class
class TestHtmlBaseInfo:
    """Tests for the scraping helpers and the /v1/get-html-base-info route."""

    def test_get_html_base_info_utils(self):
        """
        Test the functions in utils.py with a pre defined html payload
        """
        soup = scraping.get_soup_for_html(get_mocked_payload())

        assert '../../favicon.png' == utils.get_favicon_url(soup)
        assert 'Models' in utils.get_title(soup)
        assert 'Data validation' in utils.get_meta_name(soup)
        assert 'Models' == utils.get_first_h1_in_body(soup)

    def test_get_html_base_info(self, monkeypatch):
        """
        Test the html_get_base_info function with a pre defined html payload
        """
        from types import SimpleNamespace

        # main.py binds do_get_request into its own namespace via
        # `from scraping import do_get_request`, so the replacement has to be
        # installed on `main`; patching the scraping module leaves the route
        # untouched and lets the test perform a real network request.
        # The stand-in exposes `.content` because that is the attribute the
        # route reads from the response.
        fake_response = SimpleNamespace(content=get_mocked_payload())
        monkeypatch.setattr(main, "do_get_request", lambda url: fake_response)

        response = client.get("/v1/get-html-base-info?url=https://pydantic-docs.helpmanual.io/usage/models/")
        assert response.status_code == 200

        html_info = response.json()
        assert '../../favicon.png' == html_info.get('faviconUrl')
        assert 'Models' in html_info.get('title')
        assert 'Data validation' in html_info.get('metaName')
        assert 'Models' == html_info.get('firstH1')