aHardReset · aHardReset · Nov 23, 2022 · Nov 23, 2022 · Nov 24, 2022 · Nov 24, 2022
diff --git a/.gitignore b/.gitignore
@@ -1 +1,4 @@
 .venv
+.vscode
+__pycache__
+.pytest_cache
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,19 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks  # generic file hygiene hooks
+    rev: v2.3.0
+    hooks:
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+-   repo: https://github.com/asottile/reorder_python_imports  # import ordering
+    rev: v3.9.0
+    hooks:
+    -   id: reorder-python-imports
+-   repo: https://github.com/psf/black  # code formatter
+    rev: "22.12.0"
+    hooks:
+    -   id: black
+-   repo: https://github.com/PyCQA/flake8  # linter
+    rev: 6.0.0
+    hooks:
+    -   id: flake8
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.9-slim
+LABEL maintainer="Aaron Garibay <aaron.contreras@unosquare.com>"
+
+# Python/pip runtime settings plus the pinned Poetry version.
+# ENVIRONMENT selects which dependency set is installed further below.
+ENV POETRY_VERSION=1.2.2 \
+    PYTHONFAULTHANDLER=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONHASHSEED=random \
+    PIP_NO_CACHE_DIR=off \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    ENVIRONMENT=production
+
+# Install Poetry and the application servers at pinned versions.
+RUN pip install "poetry==$POETRY_VERSION" \
+    && pip install "uvicorn[standard]==0.20.0" \
+    && pip install "gunicorn==20.1.0"
+
+WORKDIR /
+# Copy only the dependency manifests before installing, so the install layer
+# does not depend on the application sources copied later.
+COPY poetry.lock pyproject.toml /
+
+# Install into the image's interpreter instead of a virtualenv.
+# Dev dependencies are skipped when ENVIRONMENT (set above) is "production".
+# `=` is the POSIX string comparison for `test`; `==` is a bash extension.
+RUN poetry config virtualenvs.create false \
+  && poetry install $(test "$ENVIRONMENT" = production && echo "--no-dev") --no-interaction --no-ansi
+
+COPY html_utils /html_utils
+
+# Serve the `app` object from html_utils/main.py on all interfaces, port 80.
+CMD ["poetry", "run", "uvicorn", "main:app", "--app-dir", "html_utils/", "--port", "80", "--host", "0.0.0.0"]
diff --git a/main.py → html_utils/__init__.py b/main.py → html_utils/__init__.py
diff --git a/html_utils/main.py b/html_utils/main.py
@@ -0,0 +1,59 @@
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+from fastapi import FastAPI
+from fastapi import status
+from pydantic import BaseModel
+from pydantic import HttpUrl
+from utils import get_favicon_url
+from utils import get_first_h1_in_body
+from utils import get_meta_name
+from utils import get_title
+
+app = FastAPI()  # application object the routes below are registered on
+HTML_PARSER = "html.parser"  # parser name handed to BeautifulSoup
+
+# models
+
+
+class HTMLBaseInfo(BaseModel):
+    # Response model of the /v1/get-html-base-info route. Each field is
+    # filled from the matching helper in utils and may be None.
+    title: Optional[str]  # result of get_title
+    metaName: Optional[str]  # result of get_meta_name
+    faviconUrl: Optional[str]  # result of get_favicon_url
+    firstH1: Optional[str]  # result of get_first_h1_in_body
+
+
+# routes
+
+
+@app.get(
+    path="/v1/get-html-base-info",
+    summary="""Get HTML base info that includes title,
+        meta name, favicon url and first h1""",
+    description="""Get HTML base info that includes title,
+        meta name, favicon url and first h1""",
+    tags=["html", "scraping"],
+    status_code=status.HTTP_200_OK,
+    response_model=HTMLBaseInfo,
+)
+def get_html_base_info(url: HttpUrl):
+    """Get HTML base info from a web page that includes
+    title, meta name, favicon url and first h1
+
+    Args:
+        url (HttpUrl): the url of the web page
+
+    Returns:
+        HTMLBaseInfo: the base info of the web page
+    """
+
+    content = requests.get(url).content
+    soup = BeautifulSoup(content, HTML_PARSER)
+    new_base_info = HTMLBaseInfo(
+        title=get_title(soup),
+        metaName=get_meta_name(soup),
+        faviconUrl=get_favicon_url(soup),
+        firstH1=get_first_h1_in_body(soup),
+    )
+    return new_base_info
diff --git a/html_utils/utils.py b/html_utils/utils.py
@@ -0,0 +1,73 @@
+from typing import Optional
+
+from bs4 import BeautifulSoup
+from bs4 import element
+
+
+def get_title(soup: BeautifulSoup) -> (None | str):
+    """search the title tag of a web page
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the title of the web page or None if not found
+    """
+    title = soup.find("title")
+    if type(title) is element.Tag:
+        return title.string
+    return None
+
+
+def get_meta_name(soup: BeautifulSoup) -> (str | None):
+    """search the meta tag of a web page and returns the content of the
+    name attribute
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the content of the name attribute or None if not found
+    """
+
+    meta_name = soup.find("meta", {"name": "description"}) or {}
+    if type(meta_name) is element.Tag:
+        content = meta_name.get("content", None)
+        if content:
+            return str(content)
+    return None
+
+
+def get_favicon_url(soup: BeautifulSoup) -> (str | None):
+    """search the favicon url of a web page
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the favicon url of the web page or None if not found
+    """
+
+    favicon = soup.find("link", {"rel": "icon"}) or {}
+    if type(favicon) is element.Tag:
+        favicon_url = favicon.get("href", None)
+        if favicon_url:
+            return str(favicon_url)
+    return None
+
+
+def get_first_h1_in_body(soup: BeautifulSoup) -> (str | None):
+    """search the first h1 in the body of a web page
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the first h1 in the body of the web page
+        or None if not found
+    """
+    body = soup.find("body")
+    if type(body) is element.Tag:
+        h1 = body.find("h1")
+
+        if type(h1) is element.Tag:
+            return h1.string
+    return None