aHardReset · aHardReset · Nov 23, 2022 · Nov 23, 2022 · Nov 24, 2022 · Nov 24, 2022
diff --git a/.gitignore b/.gitignore
@@ -1 +1,4 @@
 .venv
+.vscode
+__pycache__
+.pytest_cache
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.9-slim
+# No spaces around "=": with spaces Docker parses the legacy "LABEL key value"
+# form and the "=" ends up inside the label value.
+LABEL maintainer="Aaron Garibay <aaron.contreras@unosquare.com>"
+
+# Configure Poetry, the Python runtime and pip. ENVIRONMENT is read by the
+# dependency-install step below to decide whether dev dependencies are skipped.
+ENV POETRY_VERSION=1.2.2 \
+    PYTHONFAULTHANDLER=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONHASHSEED=random \
+    PIP_NO_CACHE_DIR=off \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    ENVIRONMENT=production
+
+# Install pinned versions of Poetry and the application servers.
+RUN pip install "poetry==$POETRY_VERSION" \
+    && pip install "uvicorn[standard]==0.20.0" \
+    && pip install "gunicorn==20.1.0"
+
+# Copy only the dependency manifests first so the install layer below can be
+# reused from cache until poetry.lock or pyproject.toml changes.
+WORKDIR /
+COPY poetry.lock pyproject.toml /
+
+# Install into the system interpreter (no virtualenv inside the container).
+# The check reads ENVIRONMENT (the variable defined by ENV above) and uses the
+# POSIX "=" operator, because RUN executes under /bin/sh where "==" is not
+# portable.
+RUN poetry config virtualenvs.create false \
+  && poetry install $(test "$ENVIRONMENT" = production && echo "--no-dev") --no-interaction --no-ansi
+
+# Application code is copied last so source edits do not invalidate the
+# dependency layers.
+COPY html_utils /html_utils
+
+# Serve the FastAPI app (html_utils/main.py:app) on all interfaces, port 80.
+CMD ["poetry", "run", "uvicorn", "main:app", "--app-dir", "html_utils/", "--port", "80", "--host", "0.0.0.0"]
diff --git a/main.py → html_utils/__init__.py b/main.py → html_utils/__init__.py
diff --git a/html_utils/main.py b/html_utils/main.py
@@ -0,0 +1,59 @@
+from typing import Optional
+
+from utils import (
+    get_title,
+    get_favicon_url,
+    get_first_h1_in_body,
+    get_meta_name,
+    get_web_page_content
+)
+
+from fastapi import FastAPI, status
+from pydantic import BaseModel, HttpUrl
+from bs4 import BeautifulSoup
+
+# ASGI application instance; the route decorators below register on it.
+app = FastAPI()
+# Parser name handed to BeautifulSoup when building the document tree.
+HTML_PARSER = "html.parser"
+
+# models
+
+
+class HTMLBaseInfo(BaseModel):
+    title: Optional[str]
+    metaName: Optional[str]
+    faviconUrl: Optional[str]
+    firstH1: Optional[str]
+
+# routes
+
+
+@app.get(
+    path="/v1/get-html-base-info",
+    summary="""Get HTML base info that includes title,
+        meta name, favicon url and first h1""",
+    description="""Get HTML base info that includes title,
+        meta name, favicon url and first h1""",
+    tags=["html", "scraping"],
+    status_code=status.HTTP_200_OK,
+    response_model=HTMLBaseInfo,
+)
+def get_html_base_info(url: HttpUrl):
+    """Get HTML base info from a web page that includes
+    title, meta name, favicon url and first h1
+
+    Args:
+        url (HttpUrl): the url of the web page
+
+    Returns:
+        HTMLBaseInfo: the base info of the web page
+    """
+
+    content = get_web_page_content(url)
+    soup = BeautifulSoup(content, HTML_PARSER)
+    new_base_info = HTMLBaseInfo(
+        title=get_title(soup),
+        metaName=get_meta_name(soup),
+        faviconUrl=get_favicon_url(soup),
+        firstH1=get_first_h1_in_body(soup),
+    )
+    return new_base_info
diff --git a/html_utils/utils.py b/html_utils/utils.py
@@ -0,0 +1,76 @@
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_title(soup: BeautifulSoup) -> str or None:
+    """ search the title tag of a web page
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the title of the web page or None if not found
+    """
+    title = soup.find("title")
+    if not title:
+        return None
+    return title.string
+
+
+def get_meta_name(soup: BeautifulSoup) -> str or None:
+    """ search the meta tag of a web page and returns the content of the
+    name attribute
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the content of the name attribute or None if not found
+    """
+
+    meta_name = soup.find("meta", {"name": "description"}) or {}
+    return meta_name.get("content", None)
+
+
+def get_favicon_url(soup: BeautifulSoup):
+    """ search the favicon url of a web page
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the favicon url of the web page or None if not found
+    """
+
+    favicon_url = soup.find("link", {"rel": "icon"}) or {}
+    return favicon_url.get("href", None)
+
+
+def get_first_h1_in_body(soup: BeautifulSoup):
+    """ search the first h1 in the body of a web page
+
+    Args:
+        soup (BeautifulSoup): the soup object of the web page
+
+    Returns:
+        str or None: the first h1 in the body of the web page
+        or None if not found
+    """
+    first_h1 = soup.find("body").find("h1")
+    if not first_h1:
+        return None
+    return first_h1.string
+
+
+def get_web_page_content(url: str) -> str:
+    """ Given a url, do a get request and return the
+    response content compatible with BeautifulSoup
+
+    Args:
+        url (str): the url of the web page
+
+    Returns:
+        str: the response content of the web page
+    """
+    response = requests.get(url)
+    return response.content