aHardReset · aHardReset · Nov 23, 2022 · Nov 23, 2022 · Nov 24, 2022 · Nov 24, 2022
diff --git a/.gitignore b/.gitignore
@@ -1 +1,4 @@
 .venv
+.vscode
+__pycache__
+.pytest_cache
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,8 @@
+# Base image bundling uvicorn and gunicorn on Python 3.9.
+FROM tiangolo/uvicorn-gunicorn:python3.9
+
+# No spaces around "=": with spaces Docker falls back to the legacy
+# "LABEL <key> <value>" form and the stored value would start with "= ".
+LABEL maintainer="Aaron Garibay <aaron.contreras@unosquare.com>"
+
+# Install dependencies before copying the sources so this layer can be
+# reused from cache when only application code changes.
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Application modules end up directly under /app (e.g. /app/main.py).
+COPY ./html_utils /app
diff --git a/main.py → html_utils/__init__.py b/main.py → html_utils/__init__.py
diff --git a/html_utils/main.py b/html_utils/main.py
@@ -0,0 +1,42 @@
+from typing import Optional
+
+from utils import get_title, get_favicon_url, get_first_h1_in_body, get_meta_name
+from scraping import do_get_request, get_soup_for_html
+
+import uvicorn
+from fastapi import FastAPI, status
+from pydantic import BaseModel, HttpUrl
+
+# Application instance; the route defined below is registered on it.
+app = FastAPI()
+
+# models
+
+class HTMLBaseInfo(BaseModel):
+    """Response model carrying the basic pieces of information of a page.
+
+    Each field is ``None`` when the corresponding element could not be
+    extracted from the scraped document.
+    """
+
+    # Explicit ``None`` defaults keep every field optional regardless of how
+    # the installed pydantic version treats a bare ``Optional[...]`` annotation.
+    title: Optional[str] = None
+    metaName: Optional[str] = None
+    faviconUrl: Optional[str] = None
+    firstH1: Optional[str] = None
+
+# routes
+
+@app.get(
+    path="/v1/get-html-base-info",
+    summary="Get HTML base info that includes title, meta name, favicon url and first h1",
+    description="Get HTML base info that includes title, meta name, favicon url and first h1",
+    tags=["html", "scraping"],
+    status_code=status.HTTP_200_OK,
+    response_model=HTMLBaseInfo,
+)
+def get_html_base_info(url: HttpUrl):
+    """Download the page at ``url`` and return its base HTML information.
+
+    Args:
+        url: Query parameter with the address of the page to scrape.
+
+    Returns:
+        An ``HTMLBaseInfo`` built from the title, description meta tag,
+        favicon link and first ``<h1>`` found in the downloaded document.
+    """
+    # NOTE(review): errors raised while fetching are not handled here and the
+    # target URL is caller-controlled - confirm both are acceptable.
+    page = do_get_request(url)
+    document = get_soup_for_html(page.content)
+    extracted = {
+        "title": get_title(document),
+        "metaName": get_meta_name(document),
+        "faviconUrl": get_favicon_url(document),
+        "firstH1": get_first_h1_in_body(document),
+    }
+    return HTMLBaseInfo(**extracted)
+
+# When executed as a script, serve the app with uvicorn on all interfaces
+# (0.0.0.0), port 8000.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/html_utils/scraping.py b/html_utils/scraping.py
@@ -0,0 +1,11 @@
+from typing import Union
+
+import requests
+from bs4 import BeautifulSoup
+
+# Default parser name passed to BeautifulSoup by get_soup_for_html.
+HTML_PARSER = "html.parser"
+
+def do_get_request(url: str, timeout: float = 10.0) -> requests.Response:
+    """Perform an HTTP GET request against ``url``.
+
+    Args:
+        url: Address to fetch.
+        timeout: Seconds to wait for the server before giving up. requests
+            applies no timeout unless one is passed, so without it an
+            unresponsive host could block the caller indefinitely.
+
+    Returns:
+        The ``requests.Response`` for the request.
+
+    Raises:
+        requests.exceptions.RequestException: On connection problems,
+            including ``requests.exceptions.Timeout`` when ``timeout`` expires.
+    """
+    return requests.get(url, timeout=timeout)
+
+def get_soup_for_html(
+    html_payload: Union[str, bytes], html_parser: str = HTML_PARSER
+) -> BeautifulSoup:
+    """Parse an HTML document into a ``BeautifulSoup`` tree.
+
+    Args:
+        html_payload: The document, as text or as raw bytes.
+        html_parser: Name of the parser BeautifulSoup should use; defaults
+            to ``HTML_PARSER``.
+
+    Returns:
+        The parsed document.
+    """
+    # The previous annotation ``str or bytes`` evaluated to plain ``str``;
+    # ``Union`` states the intended contract.
+    return BeautifulSoup(html_payload, html_parser)
diff --git a/html_utils/utils.py b/html_utils/utils.py
@@ -0,0 +1,21 @@
+from bs4 import BeautifulSoup
+
+def get_title(soup: BeautifulSoup):
+    """Return the string of the first ``<title>`` tag.
+
+    Returns ``None`` when the document has no ``<title>`` tag.
+    """
+    title_tag = soup.find("title")
+    if not title_tag:
+        return None
+    return title_tag.string
+
+def get_meta_name(soup: BeautifulSoup):
+    """Return the ``content`` of the ``<meta name="description">`` tag.
+
+    Returns:
+        The attribute value, or ``None`` when the tag is absent or has no
+        ``content`` attribute.
+    """
+    meta_tag = soup.find("meta", {"name": "description"})
+    # .get() rather than [] so a tag lacking ``content`` yields None instead
+    # of raising KeyError.
+    return meta_tag.get("content") if meta_tag else None
+
+def get_favicon_url(soup: BeautifulSoup):
+    """Return the ``href`` of the first ``<link rel="icon">`` tag.
+
+    Returns:
+        The attribute value exactly as written in the document (it may be a
+        relative URL), or ``None`` when the tag is absent or has no ``href``.
+    """
+    icon_link = soup.find("link", {"rel": "icon"})
+    # .get() rather than [] so a link lacking ``href`` yields None instead
+    # of raising KeyError.
+    return icon_link.get("href") if icon_link else None
+
+def get_first_h1_in_body(soup: BeautifulSoup):
+    """Return the string of the first ``<h1>`` inside ``<body>``.
+
+    Returns:
+        The heading's ``.string``, or ``None`` when the document has no
+        ``<body>`` or the body contains no ``<h1>``.
+    """
+    body = soup.find("body")
+    # Fragments and malformed documents may have no <body>; without this
+    # guard the chained .find() would raise AttributeError on None.
+    if body is None:
+        return None
+    first_h1 = body.find("h1")
+    return first_h1.string if first_h1 else None
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = . html_utils
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,27 @@
+anyio==3.6.2
+attrs==22.1.0
+beautifulsoup4==4.11.1
+bs4==0.0.1
+certifi==2022.9.24
+charset-normalizer==2.1.1
+click==8.1.3
+colorama==0.4.6
+fastapi==0.87.0
+h11==0.14.0
+httpcore==0.16.1
+httpx==0.23.1
+idna==3.4
+iniconfig==1.1.1
+packaging==21.3
+pluggy==1.0.0
+pydantic==1.10.2
+pyparsing==3.0.9
+pytest==7.2.0
+requests==2.28.1
+rfc3986==1.5.0
+sniffio==1.3.0
+soupsieve==2.3.2.post1
+starlette==0.21.0
+typing_extensions==4.4.0
+urllib3==1.26.12
+uvicorn==0.20.0
diff --git a/tests/html_snapshots/pydantic.html b/tests/html_snapshots/pydantic.html
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -0,0 +1,36 @@
+from html_utils import main, utils, scraping
+from fastapi.testclient import TestClient
+
+def get_mocked_payload():
+    """Return the text of the stored pydantic documentation HTML snapshot."""
+    from pathlib import Path
+
+    # Resolve the snapshot relative to this file so the tests do not depend
+    # on the directory pytest is launched from, and read it with an explicit
+    # encoding instead of the platform default.
+    snapshot = Path(__file__).parent / "html_snapshots" / "pydantic.html"
+    with open(snapshot, encoding="utf-8") as f:
+        return f.read()
+
+
+# Module-level test client bound to the FastAPI app, shared by the tests below.
+client = TestClient(main.app)
+# create a test class
+class TestHtmlBaseInfo:
+    """Tests for the scraping helpers and the base-info endpoint."""
+
+    def test_get_html_base_info_utils(self):
+        """
+        Test the functions in utils.py with a pre defined html payload
+        """
+
+        soup = scraping.get_soup_for_html(get_mocked_payload())
+
+        assert '../../favicon.png' == utils.get_favicon_url(soup)
+        assert 'Models' in utils.get_title(soup)
+        assert 'Data validation' in utils.get_meta_name(soup)
+        assert 'Models' == utils.get_first_h1_in_body(soup)
+
+    def test_get_html_base_info(self, monkeypatch):
+        """
+        Test the get_html_base_info route with a pre defined html payload
+        """
+        from types import SimpleNamespace
+
+        # The route looks up do_get_request in main's own namespace (it was
+        # imported there by name), so the replacement has to be installed on
+        # main. The route reads ``.content`` from the result, so the fake
+        # must expose that attribute rather than being a bare string.
+        fake_response = SimpleNamespace(content=get_mocked_payload())
+        monkeypatch.setattr(main, "do_get_request", lambda url: fake_response)
+
+        response = client.get("/v1/get-html-base-info?url=https://pydantic-docs.helpmanual.io/usage/models/")
+        assert response.status_code == 200
+        html_info = response.json()
+        assert '../../favicon.png' == html_info.get('faviconUrl')
+        assert 'Models' in html_info.get('title')
+        assert 'Data validation' in html_info.get('metaName')
+        assert 'Models' == html_info.get('firstH1')