Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Html info core #1

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
.venv
.vscode
__pycache__
.pytest_cache
19 changes: 19 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/asottile/reorder_python_imports
rev: v3.9.0
hooks:
- id: reorder-python-imports
- repo: https://github.com/psf/black
rev: "22.12.0"
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8
26 changes: 26 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
FROM python:3.9-slim
LABEL maintainer = "Aaron Garibay <aaron.contreras@unosquare.com>"

# Configure Poetry
ENV POETRY_VERSION=1.2.2 \
PYTHONFAULTHANDLER=1 \
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
ENVIRONMENT=production

RUN pip install "poetry==$POETRY_VERSION" \
&& pip install "uvicorn[standard]==0.20.0" \
&& pip install "gunicorn==20.1.0"

WORKDIR /
COPY poetry.lock pyproject.toml /

RUN poetry config virtualenvs.create false \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set this config as an env var instead: POETRY_VIRTUALENVS_CREATE=0

&& poetry install $(test "$YOUR_ENV" == production && echo "--no-dev") --no-interaction --no-ansi
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set --no-interaction and --no-ansi through env vars instead: POETRY_NO_INTERACTION and POETRY_NO_ANSI

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check out Docker multi-stage builds for prod and dev environments.


COPY html_utils /html_utils

CMD ["poetry", "run", "uvicorn", "main:app", "--app-dir", "html_utils/", "--port", "80", "--host", "0.0.0.0"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to call `poetry run` when virtualenvs.create is false.

File renamed without changes.
59 changes: 59 additions & 0 deletions html_utils/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import Optional

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI
from fastapi import status
from pydantic import BaseModel
from pydantic import HttpUrl
from utils import get_favicon_url
from utils import get_first_h1_in_body
from utils import get_meta_name
from utils import get_title

app = FastAPI()
HTML_PARSER = "html.parser"

# models


class HTMLBaseInfo(BaseModel):
    """Response model holding the basic information scraped from a page.

    Each field is None when the corresponding element was not found in
    the document. The defaults are spelled out explicitly so the fields
    stay optional regardless of how the installed pydantic version treats
    an ``Optional`` annotation that has no default.
    """

    # text of the <title> tag
    title: Optional[str] = None
    # value of the "content" attribute of <meta name="description">
    metaName: Optional[str] = None
    # value of the "href" attribute of the <link rel="icon"> tag
    faviconUrl: Optional[str] = None
    # text of the first <h1> found inside <body>
    firstH1: Optional[str] = None


# routes


@app.get(
    path="/v1/get-html-base-info",
    # Plain single-line strings: a triple-quoted literal would leak the
    # line break and the source indentation into the OpenAPI document.
    summary=(
        "Get HTML base info that includes title, "
        "meta name, favicon url and first h1"
    ),
    description=(
        "Get HTML base info that includes title, "
        "meta name, favicon url and first h1"
    ),
    tags=["html", "scraping"],
    status_code=status.HTTP_200_OK,
    response_model=HTMLBaseInfo,
)
def get_html_base_info(url: HttpUrl):
    """Get HTML base info from a web page that includes
    title, meta name, favicon url and first h1

    Args:
        url (HttpUrl): the url of the web page

    Returns:
        HTMLBaseInfo: the base info of the web page

    Raises:
        requests.RequestException: if the page cannot be fetched
            (connection error, timeout, ...). It is not translated here,
            so it surfaces as a server error.
    """
    # NOTE(security): `url` is caller-controlled and is fetched from the
    # server side, so this endpoint can be pointed at internal addresses
    # (SSRF). Restrict the allowed hosts/networks before exposing it.

    # requests never times out by default; without an explicit timeout a
    # stalled upstream would block this worker forever (value in seconds).
    response = requests.get(str(url), timeout=10)

    soup = BeautifulSoup(response.content, HTML_PARSER)
    return HTMLBaseInfo(
        title=get_title(soup),
        metaName=get_meta_name(soup),
        faviconUrl=get_favicon_url(soup),
        firstH1=get_first_h1_in_body(soup),
    )
73 changes: 73 additions & 0 deletions html_utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Optional

from bs4 import BeautifulSoup
from bs4 import element


# The return annotation uses Optional[...] because ``None | str`` is
# evaluated when the function is defined and is only valid on Python 3.10+.
def get_title(soup: BeautifulSoup) -> Optional[str]:
    """search the title tag of a web page

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the title of the web page, or None if there is no
            title tag or the tag does not wrap a single string
    """
    title = soup.find("title")
    if not isinstance(title, element.Tag):
        return None

    # Tag.string is None when the tag is empty or has several children.
    text = title.string
    if text is None:
        return None
    # Return a plain str, like the sibling helpers do, rather than the
    # parser's own string object.
    return str(text)


# The return annotation uses Optional[...] because ``str | None`` is
# evaluated when the function is defined and is only valid on Python 3.10+.
def get_meta_name(soup: BeautifulSoup) -> Optional[str]:
    """search the description meta tag of a web page and returns the
    value of its content attribute

    The tag looked up is the first ``<meta name="description">``.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the value of the content attribute, or None if the
            tag is not found or its content is missing or empty
    """
    meta = soup.find("meta", {"name": "description"})
    if not isinstance(meta, element.Tag):
        return None

    content = meta.get("content")
    return str(content) if content else None


# The return annotation uses Optional[...] because ``str | None`` is
# evaluated when the function is defined and is only valid on Python 3.10+.
def get_favicon_url(soup: BeautifulSoup) -> Optional[str]:
    """search the favicon url of a web page

    The url is the ``href`` of the first ``<link>`` tag whose ``rel`` is
    ``icon``. It is returned exactly as written in the page, so it may be
    relative to the page url.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the favicon url of the web page, or None if the tag
            is not found or its href is missing or empty
    """
    favicon = soup.find("link", {"rel": "icon"})
    if not isinstance(favicon, element.Tag):
        return None

    href = favicon.get("href")
    return str(href) if href else None


# The return annotation uses Optional[...] because ``str | None`` is
# evaluated when the function is defined and is only valid on Python 3.10+.
def get_first_h1_in_body(soup: BeautifulSoup) -> Optional[str]:
    """search the first h1 in the body of a web page

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        str or None: the text of the first h1 in the body of the web
            page, or None if there is no body, no h1 or the h1 is empty
    """
    # Guard clauses: `h1` is only looked up, and only read, once a body
    # tag is known to exist.
    body = soup.find("body")
    if not isinstance(body, element.Tag):
        return None

    h1 = body.find("h1")
    if not isinstance(h1, element.Tag):
        return None

    # Simple heading wrapping a single string: return it as a plain str.
    text = h1.string
    if text is not None:
        return str(text)

    # Tag.string is None when the heading holds nested markup such as
    # <h1>Hello <em>world</em></h1>; fall back to the concatenated text so
    # the heading is not reported as missing. An empty h1 still yields
    # None.
    return h1.get_text() or None