-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Html info core #1
base: main
Are you sure you want to change the base?
Changes from all commits
3718a03
101255a
5070e01
ac16ebf
fb3aaa4
7fd10bf
57829eb
c541e20
92839ea
56e83c8
3dac2fa
9fcc118
a166610
d6b8352
06d54ae
2246e06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,4 @@ | ||
.venv | ||
.vscode | ||
__pycache__ | ||
.pytest_cache |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# pre-commit configuration: each entry pins a hook repository to a fixed
# revision and lists the hooks enabled from it.
repos:
  # Generic file hygiene: YAML syntax, final newline, trailing whitespace.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  # Rewrites imports to one-per-line, sorted (the style used by the sources).
  - repo: https://github.com/asottile/reorder_python_imports
    rev: v3.9.0
    hooks:
      - id: reorder-python-imports
  # Code formatter.
  - repo: https://github.com/psf/black
    rev: "22.12.0"
    hooks:
      - id: black
  # Linter.
  - repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
FROM python:3.9-slim
# Key and value are joined by "=" with no surrounding spaces; the spaced form
# is not parsed as the intended maintainer=<value> pair.
LABEL maintainer="Aaron Garibay <aaron.contreras@unosquare.com>"

# Configure Poetry, Python and pip for the build and for the running container.
ENV POETRY_VERSION=1.2.2 \
    PYTHONFAULTHANDLER=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONHASHSEED=random \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    ENVIRONMENT=production

# Install the dependency manager and the application servers at pinned versions.
RUN pip install "poetry==$POETRY_VERSION" \
    && pip install "uvicorn[standard]==0.20.0" \
    && pip install "gunicorn==20.1.0"

WORKDIR /
# Copy only the dependency manifests first so the install layer below is
# reused from cache until one of them changes.
COPY poetry.lock pyproject.toml /

# Install into the system interpreter (no virtualenv inside the container).
# Dev dependencies are skipped when ENVIRONMENT (set in the ENV block above)
# is "production". The comparison uses a single "=" because RUN executes under
# /bin/sh, where "==" is not a portable `test` operator.
RUN poetry config virtualenvs.create false \
    && poetry install $(test "$ENVIRONMENT" = production && echo "--no-dev") --no-interaction --no-ansi
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add `--no-interaction` and `--no-ansi` as environment variables: `POETRY_NO_INTERACTION` and `POETRY_NO_ANSI`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Check out Docker multi-stage builds for prod and dev environments. |
||
|
||
# Copy the application package into the image after dependencies are
# installed, so source edits do not invalidate the dependency layer.
COPY html_utils /html_utils

# Start uvicorn serving main:app from html_utils/, listening on all
# interfaces on port 80.
CMD ["poetry", "run", "uvicorn", "main:app", "--app-dir", "html_utils/", "--port", "80", "--host", "0.0.0.0"]
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to call |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from typing import Optional | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from fastapi import FastAPI | ||
from fastapi import status | ||
from pydantic import BaseModel | ||
from pydantic import HttpUrl | ||
from utils import get_favicon_url | ||
from utils import get_first_h1_in_body | ||
from utils import get_meta_name | ||
from utils import get_title | ||
|
||
app = FastAPI() | ||
HTML_PARSER = "html.parser" | ||
|
||
# models | ||
|
||
|
||
class HTMLBaseInfo(BaseModel):
    """Basic facts extracted from an HTML page.

    Every field is optional and holds ``None`` when the corresponding
    element was not found in the page.
    """

    # Text of the page's <title> tag.
    title: Optional[str]
    # Value of the "content" attribute of <meta name="description">.
    metaName: Optional[str]
    # Value of the "href" attribute of <link rel="icon">.
    faviconUrl: Optional[str]
    # Text of the first <h1> inside <body>.
    firstH1: Optional[str]
|
||
|
||
# routes | ||
|
||
|
||
@app.get(
    path="/v1/get-html-base-info",
    # Adjacent literals keep the summary on one line (no embedded newline).
    summary=(
        "Get HTML base info that includes title, "
        "meta name, favicon url and first h1"
    ),
    description=(
        "Get HTML base info that includes title, "
        "meta name, favicon url and first h1"
    ),
    tags=["html", "scraping"],
    status_code=status.HTTP_200_OK,
    response_model=HTMLBaseInfo,
)
def get_html_base_info(url: HttpUrl):
    """Get HTML base info from a web page that includes
    title, meta name, favicon url and first h1

    Args:
        url (HttpUrl): the url of the web page

    Returns:
        HTMLBaseInfo: the base info of the web page

    Raises:
        HTTPException: 502 when the page cannot be fetched (connection
            error, timeout, invalid response, ...)
    """
    # Imported here so this handler does not depend on the module's import
    # block being edited alongside it.
    from fastapi import HTTPException

    # NOTE(security): ``url`` is caller-supplied and fetched from the server's
    # network position; nothing here restricts it to public hosts (SSRF).
    # Consider an allow-list or blocking private address ranges.

    # Without a timeout ``requests`` waits indefinitely on a stalled peer.
    timeout_seconds = 10

    try:
        # ``str(url)`` keeps this working whether or not HttpUrl subclasses str.
        response = requests.get(str(url), timeout=timeout_seconds)
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail=f"Could not fetch the requested url: {exc}",
        ) from exc

    soup = BeautifulSoup(response.content, HTML_PARSER)
    return HTMLBaseInfo(
        title=get_title(soup),
        metaName=get_meta_name(soup),
        faviconUrl=get_favicon_url(soup),
        firstH1=get_first_h1_in_body(soup),
    )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from typing import Optional

from bs4 import BeautifulSoup
from bs4 import element
|
||
|
||
def get_title(soup: BeautifulSoup) -> Optional[str]:
    """search the title tag of a web page

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the title of the web page, or None if there is no
        title tag or the tag does not hold a single string
    """
    # ``Optional[str]`` rather than ``None | str``: the union operator in an
    # annotation is evaluated at definition time and fails before Python 3.10.
    title = soup.find("title")
    if isinstance(title, element.Tag):
        return title.string
    return None
|
||
|
||
def get_meta_name(soup: BeautifulSoup) -> Optional[str]:
    """search the ``<meta name="description">`` tag of a web page and
    return the value of its ``content`` attribute

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the value of the content attribute, or None if the
        tag is missing or the attribute is absent or empty
    """
    # ``Optional[str]`` rather than ``str | None``: the union operator in an
    # annotation is evaluated at definition time and fails before Python 3.10.
    meta = soup.find("meta", {"name": "description"})
    if not isinstance(meta, element.Tag):
        return None

    content = meta.get("content")
    return str(content) if content else None
|
||
|
||
def get_favicon_url(soup: BeautifulSoup) -> Optional[str]:
    """search the favicon url of a web page

    The ``href`` of the first ``<link rel="icon">`` is returned exactly as
    written in the page; it is not resolved against the page's url.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the favicon url of the web page, or None if the link
        tag is missing or its href is absent or empty
    """
    # ``Optional[str]`` rather than ``str | None``: the union operator in an
    # annotation is evaluated at definition time and fails before Python 3.10.
    icon_link = soup.find("link", {"rel": "icon"})
    if not isinstance(icon_link, element.Tag):
        return None

    href = icon_link.get("href")
    return str(href) if href else None
|
||
|
||
def get_first_h1_in_body(soup: BeautifulSoup) -> Optional[str]:
    """search the first h1 in the body of a web page

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the first h1 in the body of the web page, or None if
        there is no body, no h1 inside it, or the h1 does not hold a
        single string
    """
    # ``Optional[str]`` rather than ``str | None``: the union operator in an
    # annotation is evaluated at definition time and fails before Python 3.10.
    body = soup.find("body")
    # Guard first so ``h1`` is only looked up (and used) when a body exists.
    if not isinstance(body, element.Tag):
        return None

    h1 = body.find("h1")
    if isinstance(h1, element.Tag):
        return h1.string
    return None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
set config as env var:
POETRY_VIRTUALENVS_CREATE=0