-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Html info core #1
base: main
Are you sure you want to change the base?
Changes from 14 commits
3718a03
101255a
5070e01
ac16ebf
fb3aaa4
7fd10bf
57829eb
c541e20
92839ea
56e83c8
3dac2fa
9fcc118
a166610
d6b8352
06d54ae
2246e06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,4 @@ | ||
.venv | ||
.vscode | ||
__pycache__ | ||
.pytest_cache |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
FROM python:3.9-slim
# LABEL uses the key=value form, with no spaces around "=".
LABEL maintainer="Aaron Garibay <aaron.contreras@unosquare.com>"

# Configure Python, pip and Poetry
ENV POETRY_VERSION=1.2.2 \
    PYTHONFAULTHANDLER=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONHASHSEED=random \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    ENVIRONMENT=production

RUN pip install "poetry==$POETRY_VERSION" \
    && pip install "uvicorn[standard]==0.20.0" \
    && pip install "gunicorn==20.1.0"

WORKDIR /
COPY poetry.lock pyproject.toml /

# Install into the system interpreter (no virtualenv inside the container).
# The check reads $ENVIRONMENT, the variable declared in ENV above, and uses
# the POSIX "=" operator because shell-form RUN executes under /bin/sh.
RUN poetry config virtualenvs.create false \
    && poetry install $(test "$ENVIRONMENT" = production && echo "--no-dev") --no-interaction --no-ansi
Review comment: pass --no-interaction and --no-ansi as environment variables instead: POETRY_NO_INTERACTION and POETRY_NO_ANSI.
Review comment: Check out Docker multi-stage builds for the prod and dev environments.
||
|
||
COPY html_utils /html_utils

# Dependencies are installed into the system interpreter (virtualenvs.create
# is false) and uvicorn is installed with pip, so the server can be started
# directly without going through `poetry run`.
CMD ["uvicorn", "main:app", "--app-dir", "html_utils/", "--port", "80", "--host", "0.0.0.0"]
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to call |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from typing import Optional | ||
|
||
from utils import ( | ||
get_title, | ||
get_favicon_url, | ||
get_first_h1_in_body, | ||
get_meta_name, | ||
get_web_page_content | ||
) | ||
|
||
from fastapi import FastAPI, status | ||
from pydantic import BaseModel, HttpUrl | ||
from bs4 import BeautifulSoup | ||
|
||
app = FastAPI() | ||
HTML_PARSER = "html.parser" | ||
|
||
# models | ||
|
||
|
||
class HTMLBaseInfo(BaseModel):
    """Response model holding the basic information scraped from a page.

    Every field is optional and defaults to None explicitly, so the model
    can be built when a piece of information is missing from the page.
    """

    title: Optional[str] = None  # value returned by get_title
    metaName: Optional[str] = None  # value returned by get_meta_name
    faviconUrl: Optional[str] = None  # value returned by get_favicon_url
    firstH1: Optional[str] = None  # value returned by get_first_h1_in_body
|
||
# routes | ||
|
||
|
||
@app.get(
    path="/v1/get-html-base-info",
    # Single-line strings: a triple-quoted literal would embed the line break
    # (and any indentation) in the generated OpenAPI summary/description.
    summary=(
        "Get HTML base info that includes title, "
        "meta name, favicon url and first h1"
    ),
    description=(
        "Get HTML base info that includes title, "
        "meta name, favicon url and first h1"
    ),
    tags=["html", "scraping"],
    status_code=status.HTTP_200_OK,
    response_model=HTMLBaseInfo,
)
def get_html_base_info(url: HttpUrl) -> HTMLBaseInfo:
    """Get HTML base info from a web page that includes
    title, meta name, favicon url and first h1

    Args:
        url (HttpUrl): the url of the web page

    Returns:
        HTMLBaseInfo: the base info of the web page
    """
    # get_web_page_content is annotated to take a plain string.
    content = get_web_page_content(str(url))
    soup = BeautifulSoup(content, HTML_PARSER)
    return HTMLBaseInfo(
        title=get_title(soup),
        metaName=get_meta_name(soup),
        faviconUrl=get_favicon_url(soup),
        firstH1=get_first_h1_in_body(soup),
    )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import Optional

import requests
from bs4 import BeautifulSoup
|
||
|
||
def get_title(soup: BeautifulSoup) -> Optional[str]:
    """Search the title tag of a web page.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        Optional[str]: the ``string`` of the title tag, or None if the
        page has no title tag
    """
    title = soup.find("title")
    if title is None:
        return None
    return title.string
|
||
|
||
def get_meta_name(soup: BeautifulSoup) -> Optional[str]:
    """Search the ``<meta name="description">`` tag of a web page and
    return the value of its ``content`` attribute.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        Optional[str]: the value of the content attribute, or None if the
        tag or the attribute is not found
    """
    meta_description = soup.find("meta", {"name": "description"})
    if meta_description is None:
        return None
    return meta_description.get("content")
|
||
|
||
def get_favicon_url(soup: BeautifulSoup) -> Optional[str]:
    """Search the favicon url of a web page.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        Optional[str]: the ``href`` of the ``<link rel="icon">`` tag exactly
        as written in the page, or None if the tag or the attribute is
        not found
    """
    icon_link = soup.find("link", {"rel": "icon"})
    if icon_link is None:
        return None
    return icon_link.get("href")
|
||
|
||
def get_first_h1_in_body(soup: BeautifulSoup) -> Optional[str]:
    """Search the first h1 in the body of a web page.

    Args:
        soup (BeautifulSoup): the soup object of the web page

    Returns:
        Optional[str]: the ``string`` of the first h1 inside the body, or
        None if the page has no body tag or the body has no h1
    """
    body = soup.find("body")
    # A document without a <body> tag yields None here; calling .find on it
    # would raise AttributeError.
    if body is None:
        return None
    first_h1 = body.find("h1")
    if first_h1 is None:
        return None
    return first_h1.string
|
||
|
||
def get_web_page_content(url: str, timeout: float = 10.0) -> bytes:
    """Given a url, do a get request and return the
    response content compatible with BeautifulSoup.

    Args:
        url (str): the url of the web page
        timeout (float): seconds to wait for the server before giving up;
            without it the request could block indefinitely

    Returns:
        bytes: the raw response body of the web page

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out. HTTP error statuses are not raised; their body is
            returned like any other response.
    """
    response = requests.get(url, timeout=timeout)
    return response.content
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Set this config as an env var:
POETRY_VIRTUALENVS_CREATE = 0